mirror of
https://github.com/ollama/ollama
synced 2026-04-23 08:45:14 +00:00
gemma4: add Gemma 4 GGML model support
Add full Gemma 4 model family support (E2B, E4B, 26B MoE, 31B Dense) for the GGML backend including text, vision, converter, parser, and renderer. Text model features: - Sliding window + full attention with per-layer patterns - KV sharing across layers with donor map - Per-layer embeddings (PLE) with learned projections - MoE routing with RMSNorm + learned scale - Proportional RoPE with freq_factors for global attention - Final logit softcapping Vision model features: - SigLIP vision encoder with 2D RoPE - ClippableLinear with input/output clamping via packed v.clamp_data - Adaptive average pooling with nMerge kernel - Multi-modal projection with unweighted RMSNorm Converter: - Safetensors to GGUF with vision tensor renaming - Fused MoE gate_up_proj splitting - Vision patch embedding reshape (HF to Conv2D layout) - Packed clamp data tensor for ClippableLinear bounds - Proportional RoPE freq_factors generation Also includes: - BackendGet() on ml.Tensor for reading weight tensor data - Q6_K CUDA get_rows kernel support - MoE-aware ffn_down quantization layer counting - Gemma4 parser with tool calling and thinking support - Gemma4 renderer with structured tool format - Architecture-based auto-detection of renderer/parser/stop tokens - Integration test gemma4 model list additions
This commit is contained in:
parent
f6b69f3f28
commit
ea3c6a3cbe
|
|
@ -290,6 +290,8 @@ func LoadModelMetadata(fsys fs.FS) (ModelKV, *Tokenizer, error) {
|
|||
conv = &gemma3Model{Architecture: p.Architectures[0]}
|
||||
case "Gemma3nForConditionalGeneration":
|
||||
conv = &gemma3nModel{}
|
||||
case "Gemma4ForCausalLM", "Gemma4ForConditionalGeneration":
|
||||
conv = &gemma4Model{Architecture: p.Architectures[0]}
|
||||
case "Phi3ForCausalLM":
|
||||
conv = &phi3Model{}
|
||||
case "Qwen2ForCausalLM":
|
||||
|
|
|
|||
514
convert/convert_gemma4.go
Normal file
514
convert/convert_gemma4.go
Normal file
|
|
@ -0,0 +1,514 @@
|
|||
package convert
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/binary"
|
||||
"fmt"
|
||||
"math"
|
||||
"slices"
|
||||
"strings"
|
||||
|
||||
"github.com/ollama/ollama/fs/ggml"
|
||||
"github.com/pdevine/tensor"
|
||||
"github.com/pdevine/tensor/native"
|
||||
)
|
||||
|
||||
// gemma4Model converts a Gemma 4 safetensors checkpoint (text and optional
// vision tower) into GGUF metadata and tensors. Fields mirror the HF
// config.json layout via the json tags below.
type gemma4Model struct {
	gemmaModel

	// Architecture is the HF architecture string that selected this
	// converter ("Gemma4ForCausalLM" or "Gemma4ForConditionalGeneration").
	Architecture string

	// TextModel mirrors config.json "text_config".
	TextModel struct {
		HiddenSize            uint32   `json:"hidden_size"`
		NumHiddenLayers       uint32   `json:"num_hidden_layers"`
		IntermediateSize      uint32   `json:"intermediate_size"`
		NumAttentionHeads     uint32   `json:"num_attention_heads"`
		NumKeyValueHeads      uint32   `json:"num_key_value_heads"`
		// HeadDim is the local (sliding-window) attention head dim;
		// GlobalHeadDim is the full-attention head dim.
		HeadDim               uint32   `json:"head_dim"`
		GlobalHeadDim         uint32   `json:"global_head_dim"`
		VocabSize             uint32   `json:"vocab_size"`
		RMSNormEps            float32  `json:"rms_norm_eps"`
		MaxPositionEmbeddings uint32   `json:"max_position_embeddings"`
		SlidingWindow         uint32   `json:"sliding_window"`
		// Pointer fields are optional in config.json; nil means absent.
		SlidingWindowPattern  *int32   `json:"_sliding_window_pattern"`
		// LayerTypes labels each layer "sliding_attention" or global.
		LayerTypes            []string `json:"layer_types"`
		FinalLogitSoftcapping float32  `json:"final_logit_softcapping"`

		// MoE configuration (26B MoE variant).
		EnableMoeBlock         bool    `json:"enable_moe_block"`
		NumExperts             *uint32 `json:"num_experts"`
		TopKExperts            *uint32 `json:"top_k_experts"`
		ExpertIntermediateSize *uint32 `json:"expert_intermediate_size"`

		// Per-layer embeddings (PLE) and KV sharing.
		HiddenSizePerLayerInput *uint32 `json:"hidden_size_per_layer_input"`
		NumKVSharedLayers       uint32  `json:"num_kv_shared_layers"`

		AttentionKEqV          bool    `json:"attention_k_eq_v"`
		NumGlobalKeyValueHeads *uint32 `json:"num_global_key_value_heads"`
		QueryPreAttnScalar     *uint32 `json:"query_pre_attn_scalar"`
		// UseDoubleWideMLP doubles FFN width on KV-shared layers (see KV()).
		UseDoubleWideMLP       bool    `json:"use_double_wide_mlp"`

		// RopeParameters is keyed by layer type ("full_attention",
		// "sliding_attention"); values may be nil.
		RopeParameters map[string]*struct {
			RopeTheta           float32  `json:"rope_theta"`
			PartialRotaryFactor *float32 `json:"partial_rotary_factor"`
		} `json:"rope_parameters"`
	} `json:"text_config"`

	// VisionModel mirrors config.json "vision_config"; a zero
	// NumHiddenLayers means the checkpoint has no vision tower.
	VisionModel struct {
		HiddenSize        uint32  `json:"hidden_size"`
		NumHiddenLayers   uint32  `json:"num_hidden_layers"`
		NumAttentionHeads uint32  `json:"num_attention_heads"`
		IntermediateSize  uint32  `json:"intermediate_size"`
		PatchSize         uint32  `json:"patch_size"`
		NumChannels       uint32  `json:"num_channels"`
		PoolingKernelSize uint32  `json:"pooling_kernel_size"`
		LayerNormEps      float32 `json:"layer_norm_eps"`
	} `json:"vision_config"`
}
|
||||
|
||||
// KV builds the GGUF key/value metadata for a Gemma 4 checkpoint from the
// parsed HF config, covering text attention/RoPE/MoE/PLE settings and,
// when present, the vision tower.
func (p *gemma4Model) KV(t *Tokenizer) KV {
	kv := p.ModelParameters.KV(t)
	kv["general.architecture"] = "gemma4"
	kv["tokenizer.ggml.model"] = "llama"
	kv["tokenizer.ggml.pre"] = "gemma4"

	tc := p.TextModel

	kv["gemma4.block_count"] = tc.NumHiddenLayers
	kv["gemma4.embedding_length"] = tc.HiddenSize

	// Per-layer FFN width: when use_double_wide_mlp is set, KV-shared layers get 2x FFN width.
	if tc.UseDoubleWideMLP && tc.NumKVSharedLayers > 0 {
		// KV-shared layers are the trailing NumKVSharedLayers layers.
		firstShared := int(tc.NumHiddenLayers) - int(tc.NumKVSharedLayers)
		ffnWidths := make([]int32, tc.NumHiddenLayers)
		for i := range ffnWidths {
			if i >= firstShared {
				ffnWidths[i] = int32(tc.IntermediateSize * 2)
			} else {
				ffnWidths[i] = int32(tc.IntermediateSize)
			}
		}
		kv["gemma4.feed_forward_length"] = ffnWidths
	} else {
		// Uniform FFN width across all layers.
		kv["gemma4.feed_forward_length"] = tc.IntermediateSize
	}
	kv["gemma4.context_length"] = tc.MaxPositionEmbeddings
	kv["gemma4.attention.head_count"] = tc.NumAttentionHeads
	// Per-layer KV head count array: SWA layers use NumKeyValueHeads, global layers use NumGlobalKeyValueHeads
	if tc.NumGlobalKeyValueHeads != nil && *tc.NumGlobalKeyValueHeads != tc.NumKeyValueHeads && len(tc.LayerTypes) > 0 {
		kvHeads := make([]int32, len(tc.LayerTypes))
		for i, lt := range tc.LayerTypes {
			if lt == "sliding_attention" {
				kvHeads[i] = int32(tc.NumKeyValueHeads)
			} else {
				kvHeads[i] = int32(*tc.NumGlobalKeyValueHeads)
			}
		}
		kv["gemma4.attention.head_count_kv"] = kvHeads
	} else {
		// Scalar form when local and global KV head counts agree.
		kv["gemma4.attention.head_count_kv"] = tc.NumKeyValueHeads
	}
	// key_length = global head dim, key_length_swa = local (SWA) head dim
	kv["gemma4.attention.key_length"] = tc.GlobalHeadDim
	kv["gemma4.attention.value_length"] = tc.GlobalHeadDim
	kv["gemma4.attention.key_length_swa"] = tc.HeadDim
	kv["gemma4.attention.value_length_swa"] = tc.HeadDim
	kv["gemma4.attention.layer_norm_rms_epsilon"] = tc.RMSNormEps
	kv["gemma4.attention.sliding_window"] = tc.SlidingWindow

	// Sliding window pattern from layer_types: a []bool with true on
	// sliding-attention layers, built via the iterator-to-slice helper.
	if len(tc.LayerTypes) > 0 {
		kv["gemma4.attention.sliding_window_pattern"] = slices.Collect(func(yield func(bool) bool) {
			for _, lt := range tc.LayerTypes {
				if !yield(lt == "sliding_attention") {
					break
				}
			}
		})
	}

	kv["gemma4.attention.shared_kv_layers"] = tc.NumKVSharedLayers

	// RoPE: dimension_count is the full global head dim (freq_factors handle partial rotation)
	if rp, ok := tc.RopeParameters["full_attention"]; ok && rp != nil {
		kv["gemma4.rope.freq_base"] = rp.RopeTheta
		kv["gemma4.rope.dimension_count"] = tc.GlobalHeadDim
	}
	if rp, ok := tc.RopeParameters["sliding_attention"]; ok && rp != nil {
		kv["gemma4.rope.freq_base_swa"] = rp.RopeTheta
		kv["gemma4.rope.dimension_count_swa"] = tc.HeadDim
	}

	if tc.FinalLogitSoftcapping > 0 {
		kv["gemma4.final_logit_softcapping"] = tc.FinalLogitSoftcapping
	}

	// MoE (only when the config enables the MoE block and declares experts).
	if tc.EnableMoeBlock && tc.NumExperts != nil {
		kv["gemma4.expert_count"] = *tc.NumExperts
		if tc.TopKExperts != nil {
			kv["gemma4.expert_used_count"] = *tc.TopKExperts
		}
		if tc.ExpertIntermediateSize != nil {
			kv["gemma4.expert_feed_forward_length"] = *tc.ExpertIntermediateSize
		}
	}

	// PLE — always emit, even when 0
	pleSize := uint32(0)
	if tc.HiddenSizePerLayerInput != nil {
		pleSize = *tc.HiddenSizePerLayerInput
	}
	kv["gemma4.embedding_length_per_layer_input"] = pleSize

	// Vision model KV metadata (only when a vision tower is present).
	vc := p.VisionModel
	if vc.NumHiddenLayers > 0 {
		kv["gemma4.vision.block_count"] = vc.NumHiddenLayers
		kv["gemma4.vision.embedding_length"] = vc.HiddenSize
		kv["gemma4.vision.attention.head_count"] = vc.NumAttentionHeads
		kv["gemma4.vision.feed_forward_length"] = vc.IntermediateSize
		kv["gemma4.vision.patch_size"] = vc.PatchSize
		numCh := vc.NumChannels
		if numCh == 0 {
			// Default to RGB when the config omits num_channels.
			numCh = 3
		}
		kv["gemma4.vision.num_channels"] = numCh
		nMerge := vc.PoolingKernelSize
		if nMerge == 0 {
			// NOTE(review): default pooling kernel of 3 when unset —
			// confirm against published Gemma 4 vision configs.
			nMerge = 3
		}
		kv["gemma4.vision.projector.scale_factor"] = nMerge
		eps := vc.LayerNormEps
		if eps == 0 {
			eps = 1e-6
		}
		kv["gemma4.vision.attention.layer_norm_epsilon"] = eps
	}

	return kv
}
|
||||
|
||||
func (p *gemma4Model) Tensors(ts []Tensor) []*ggml.Tensor {
|
||||
// First pass: collect vision clamp scalar values into a packed tensor.
|
||||
// Layout: per vision layer (0..N-1), 7 linears (q,k,v,out,gate,up,down) × 4 values (inMin,inMax,outMin,outMax).
|
||||
// Then 4 values for the projector (mm.input_projection).
|
||||
clampSuffixes := []string{".input_min", ".input_max", ".output_min", ".output_max"}
|
||||
clampMap := make(map[string]float32)
|
||||
for _, t := range ts {
|
||||
name := t.Name()
|
||||
for _, sfx := range clampSuffixes {
|
||||
if strings.HasSuffix(name, sfx) && (strings.Contains(name, "vision_tower") || strings.Contains(name, "embed_vision")) {
|
||||
var buf bytes.Buffer
|
||||
t.WriteTo(&buf)
|
||||
data := buf.Bytes()
|
||||
if len(data) >= 4 {
|
||||
clampMap[name] = math.Float32frombits(uint32(data[0]) | uint32(data[1])<<8 | uint32(data[2])<<16 | uint32(data[3])<<24)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
var out []*ggml.Tensor
|
||||
for _, t := range ts {
|
||||
name := t.Name()
|
||||
|
||||
// Skip audio tensors (vision is now handled)
|
||||
if strings.Contains(name, "audio_tower") || strings.Contains(name, "embed_audio") {
|
||||
continue
|
||||
}
|
||||
|
||||
// Skip embedding_post_projection_norm — used as weightless RMS norm in inference
|
||||
if strings.Contains(name, "embedding_post_projection_norm") {
|
||||
continue
|
||||
}
|
||||
|
||||
// Skip clippable linear clamp scalars — packed into v.clamp_data below
|
||||
if strings.Contains(name, "input_min") || strings.Contains(name, "input_max") ||
|
||||
strings.Contains(name, "output_min") || strings.Contains(name, "output_max") {
|
||||
continue
|
||||
}
|
||||
|
||||
// Vision tensor renaming: match published mmproj GGUF names
|
||||
if strings.HasPrefix(name, "v.blk.") {
|
||||
name = strings.Replace(name, ".attn_norm.", ".ln1.", 1)
|
||||
name = strings.Replace(name, ".ffn_norm.", ".ln2.", 1)
|
||||
name = strings.Replace(name, ".attn_output.", ".attn_out.", 1)
|
||||
name = strings.Replace(name, ".post_attention_norm.", ".attn_post_norm.", 1)
|
||||
name = strings.Replace(name, ".post_ffw_norm.", ".ffn_post_norm.", 1)
|
||||
name = strings.Replace(name, ".layer_output_scale.", ".out_scale.", 1)
|
||||
}
|
||||
|
||||
// Audio tensor post-processing: block-level norm rename and per_dim_scale softplus.
|
||||
if strings.HasPrefix(name, "a.blk.") {
|
||||
// Conformer block final norm: a.blk.N.norm.weight → a.blk.N.layer_pre_norm.weight
|
||||
if dotIdx := strings.Index(name[6:], "."); dotIdx >= 0 {
|
||||
rest := name[6+dotIdx+1:]
|
||||
if strings.HasPrefix(rest, "norm.") {
|
||||
name = name[:6+dotIdx+1] + "layer_pre_norm." + rest[5:]
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// per_dim_scale / per_dim_k_scale: apply softplus to weight data and add .weight suffix.
|
||||
if strings.HasPrefix(name, "a.blk.") &&
|
||||
(strings.HasSuffix(name, "per_dim_scale") || strings.HasSuffix(name, "per_dim_k_scale")) {
|
||||
name = name + ".weight"
|
||||
t.SetRepacker(softplusRepacker)
|
||||
}
|
||||
|
||||
// Depthwise conv1d: squeeze middle dimension [C, 1, K] → [C, K].
|
||||
if strings.HasPrefix(name, "a.blk.") && strings.Contains(name, "conv_dw") && strings.HasSuffix(name, ".weight") {
|
||||
t.SetRepacker(squeezeMiddleDim)
|
||||
}
|
||||
|
||||
shape := t.Shape()
|
||||
|
||||
// Convert scalar tensors (input_min/max, output_min/max) to 1D
|
||||
if len(shape) == 0 {
|
||||
shape = []uint64{1}
|
||||
}
|
||||
|
||||
// Fused MoE gate_up_proj: split [experts, 2*intermediate, hidden] into separate gate and up.
|
||||
// No transpose needed — the split shape [experts, intermediate, hidden] already matches
|
||||
// the GGUF layout after the framework's dimension reversal (ne[0]=hidden matches input).
|
||||
if strings.Contains(name, "moe.gate_up_proj") && len(shape) == 3 {
|
||||
halfDim := int(shape[1]) / 2
|
||||
newShape := slices.Clone(shape)
|
||||
newShape[1] = newShape[1] / 2
|
||||
for i, ggufName := range []string{"ffn_gate_exps.weight", "ffn_up_exps.weight"} {
|
||||
tt := t.Clone()
|
||||
tt.SetRepacker(p.sliceExperts(tensor.S(i*halfDim, (i+1)*halfDim)))
|
||||
out = append(out, &ggml.Tensor{
|
||||
Name: strings.ReplaceAll(name, "moe.gate_up_proj", ggufName),
|
||||
Kind: tt.Kind(),
|
||||
Shape: slices.Clone(newShape),
|
||||
WriterTo: tt,
|
||||
})
|
||||
}
|
||||
continue
|
||||
}
|
||||
|
||||
// MoE expert weights: no transpose needed. Safetensors stores [experts, out, in]
|
||||
// which the framework reverses to GGUF ne=[in, out, experts], matching ggml_mul_mat_id.
|
||||
// (transposeExperts was incorrectly swapping dims — removed)
|
||||
|
||||
// Vision patch embedding: reshape from [n_embd, ksize_sq_c] to [n_embd, 3, patch_size, patch_size]
|
||||
// Must be stored as F16 (not BF16) because the Conv2D im2col kernel requires F16/F32.
|
||||
var kindOverride *uint32
|
||||
if strings.Contains(name, "v.patch_embd.weight") && len(shape) == 2 {
|
||||
nEmbd := shape[0]
|
||||
patchSize := uint64(p.VisionModel.PatchSize)
|
||||
if patchSize == 0 {
|
||||
patchSize = 16
|
||||
}
|
||||
numCh := uint64(p.VisionModel.NumChannels)
|
||||
if numCh == 0 {
|
||||
numCh = 3
|
||||
}
|
||||
t.SetRepacker(p.reshapePatchEmbed)
|
||||
shape = []uint64{nEmbd, numCh, patchSize, patchSize}
|
||||
f16Kind := uint32(1) // tensorKindFP16
|
||||
kindOverride = &f16Kind
|
||||
}
|
||||
|
||||
// Vision position embedding: keep 3D [2, maxPos, nEmbd] — matching published mmproj format.
|
||||
// The framework reverses shape to GGUF ne=[nEmbd, maxPos, 2]. No data repacking needed.
|
||||
|
||||
kind := t.Kind()
|
||||
if kindOverride != nil {
|
||||
kind = *kindOverride
|
||||
}
|
||||
out = append(out, &ggml.Tensor{
|
||||
Name: name,
|
||||
Kind: kind,
|
||||
Shape: shape,
|
||||
WriterTo: t,
|
||||
})
|
||||
}
|
||||
|
||||
// Generate a single global rope_freqs.weight for proportional RoPE on global attention layers.
|
||||
// This matches the published GGUF format: one global tensor shared by all layers.
|
||||
// Global layers use partial_rotary_factor (0.25) — only rotate that fraction of dims.
|
||||
// Dimensions beyond the rotated portion get freq_factor=1e30 (effectively no rotation).
|
||||
tc := p.TextModel
|
||||
if tc.GlobalHeadDim > 0 {
|
||||
globalFreqsSize := tc.GlobalHeadDim / 2 // freq_factors are per dimension pair
|
||||
|
||||
// Compute number of rotated pairs for global layers
|
||||
partialRotaryFactor := float32(0.25) // default
|
||||
if rp, ok := tc.RopeParameters["full_attention"]; ok && rp != nil && rp.PartialRotaryFactor != nil {
|
||||
partialRotaryFactor = *rp.PartialRotaryFactor
|
||||
}
|
||||
nRotFull := int(float32(tc.GlobalHeadDim) * partialRotaryFactor / 2)
|
||||
|
||||
freqs := make(ropeFactor, globalFreqsSize)
|
||||
for j := range freqs {
|
||||
if j < nRotFull {
|
||||
freqs[j] = 1.0
|
||||
} else {
|
||||
freqs[j] = 1e30 // effectively disable rotation
|
||||
}
|
||||
}
|
||||
out = append(out, &ggml.Tensor{
|
||||
Name: "rope_freqs.weight",
|
||||
Kind: 0, // F32
|
||||
Shape: []uint64{uint64(len(freqs))},
|
||||
WriterTo: freqs,
|
||||
})
|
||||
}
|
||||
|
||||
// Emit packed vision clamp data as a single F32 tensor.
|
||||
// Layout: numLayers × 7 linears (q,k,v,out,gate,up,down) × 4 floats (inMin,inMax,outMin,outMax)
|
||||
// then 4 floats for the projector. Total = (numLayers*7 + 1) * 4 floats.
|
||||
if len(clampMap) > 0 {
|
||||
numLayers := int(p.VisionModel.NumHiddenLayers)
|
||||
linearNames := []string{"attn_q", "attn_k", "attn_v", "attn_out", "ffn_gate", "ffn_up", "ffn_down"}
|
||||
suffixes := []string{".input_min", ".input_max", ".output_min", ".output_max"}
|
||||
|
||||
totalFloats := (numLayers*len(linearNames) + 1) * 4 // +1 for projector
|
||||
clampData := make([]float32, totalFloats)
|
||||
|
||||
for layer := range numLayers {
|
||||
for li, ln := range linearNames {
|
||||
for si, sfx := range suffixes {
|
||||
sfxMap := map[string]string{"attn_q": "q_proj", "attn_k": "k_proj", "attn_v": "v_proj", "attn_out": "o_proj", "ffn_gate": "gate_proj", "ffn_up": "up_proj", "ffn_down": "down_proj"}
|
||||
for origName, val := range clampMap {
|
||||
if strings.Contains(origName, fmt.Sprintf("layers.%d.", layer)) && strings.HasSuffix(origName, sfx) && strings.Contains(origName, sfxMap[ln]) {
|
||||
idx := (layer*len(linearNames)+li)*4 + si
|
||||
clampData[idx] = val
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// Projector clamp values
|
||||
projIdx := numLayers * len(linearNames) * 4
|
||||
for si, sfx := range suffixes {
|
||||
for origName, val := range clampMap {
|
||||
if strings.Contains(origName, "input_projection") && strings.HasSuffix(origName, sfx) {
|
||||
clampData[projIdx+si] = val
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
var buf bytes.Buffer
|
||||
binary.Write(&buf, binary.LittleEndian, clampData)
|
||||
out = append(out, &ggml.Tensor{
|
||||
Name: "v.clamp_data",
|
||||
Kind: 0, // F32
|
||||
Shape: []uint64{uint64(totalFloats)},
|
||||
WriterTo: &buf,
|
||||
})
|
||||
}
|
||||
|
||||
return out
|
||||
}
|
||||
|
||||
// reshapePatchEmbed reshapes the vision patch embedding from HF layout [n_embd, ksize*ksize*channels]
|
||||
// to GGUF layout [n_embd, channels, patch_size, patch_size].
|
||||
func (*gemma4Model) reshapePatchEmbed(_ string, data []float32, shape []uint64) ([]float32, error) {
|
||||
if len(shape) != 2 {
|
||||
return data, nil
|
||||
}
|
||||
nEmbd := int(shape[0])
|
||||
ksqC := int(shape[1])
|
||||
nChannels := 3
|
||||
patchSize := int(math.Sqrt(float64(ksqC / nChannels)))
|
||||
|
||||
// HF layout: [n_embd, patch_size * patch_size * channels] (row-major)
|
||||
// Need: [n_embd, channels, patch_size, patch_size]
|
||||
result := make([]float32, len(data))
|
||||
for e := range nEmbd {
|
||||
for c := range nChannels {
|
||||
for h := range patchSize {
|
||||
for w := range patchSize {
|
||||
srcIdx := e*ksqC + h*patchSize*nChannels + w*nChannels + c
|
||||
dstIdx := e*nChannels*patchSize*patchSize + c*patchSize*patchSize + h*patchSize + w
|
||||
result[dstIdx] = data[srcIdx]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
shape[0] = uint64(nEmbd)
|
||||
shape[1] = uint64(nChannels * patchSize * patchSize)
|
||||
return result, nil
|
||||
}
|
||||
|
||||
// sliceExperts returns a repacker that slices dim 1 of a 3D expert tensor.
|
||||
// Used for splitting fused gate_up_proj into separate gate and up tensors.
|
||||
func (*gemma4Model) sliceExperts(dim1Slice tensor.Slice) Repacker {
|
||||
return func(_ string, data []float32, shape []uint64) ([]float32, error) {
|
||||
dims := make([]int, len(shape))
|
||||
for i, d := range shape {
|
||||
dims[i] = int(d)
|
||||
}
|
||||
|
||||
var t tensor.Tensor = tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
|
||||
t, err := t.Slice(nil, dim1Slice)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
t = tensor.Materialize(t)
|
||||
if err := t.Reshape(t.Shape().TotalSize()); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return native.VectorF32(t.(*tensor.Dense))
|
||||
}
|
||||
}
|
||||
|
||||
// Replacements returns ordered old/new substring pairs applied to HF
// safetensors tensor names to produce GGUF tensor names. Order matters:
// more specific patterns (e.g. the numbered *_2 norms, embed_tokens_per_layer)
// must precede the shorter patterns they contain.
func (p *gemma4Model) Replacements() []string {
	return []string{
		// Vision ClippableLinear wraps nn.Linear — strip .linear. from weight path
		".linear.weight", ".weight",

		// Vision encoder
		"model.vision_tower.encoder.layers", "v.blk",
		"model.vision_tower.patch_embedder.input_proj", "v.patch_embd",
		"model.vision_tower.patch_embedder.position_embedding_table", "v.position_embd.weight",

		// Multimodal projector
		"model.embed_vision.embedding_projection", "mm.input_projection",

		// Text model (per-layer embeddings must be matched before embed_tokens)
		"model.language_model.embed_tokens_per_layer", "per_layer_token_embd",
		"model.language_model.embed_tokens", "token_embd",
		"model.language_model.per_layer_model_projection", "per_layer_model_proj",
		"model.language_model.per_layer_projection_norm", "per_layer_proj_norm",
		"model.language_model.norm", "output_norm",
		"model.language_model.layers", "blk",

		// Shared attention replacements (work for both text and vision tensors)
		"input_layernorm", "attn_norm",
		"self_attn.q_proj", "attn_q",
		"self_attn.q_norm", "attn_q_norm",
		"self_attn.k_proj", "attn_k",
		"self_attn.k_norm", "attn_k_norm",
		"self_attn.v_proj", "attn_v",
		"self_attn.o_proj", "attn_output",
		"mlp.gate_proj", "ffn_gate",
		"mlp.down_proj", "ffn_down",
		"mlp.up_proj", "ffn_up",

		// Post norms (numbered variants before their unnumbered prefixes)
		"post_attention_layernorm", "post_attention_norm",
		"pre_feedforward_layernorm_2", "pre_ffw_norm_2",
		"pre_feedforward_layernorm", "ffn_norm",
		"post_feedforward_layernorm_1", "post_ffw_norm_1",
		"post_feedforward_layernorm_2", "post_ffw_norm_2",
		"post_feedforward_layernorm", "post_ffw_norm",

		// PLE
		"per_layer_input_gate", "inp_gate",
		"per_layer_projection", "proj",
		"post_per_layer_input_norm", "post_norm",

		// MoE
		"router.proj", "ffn_gate_inp",
		"router.scale", "ffn_gate_inp.scale",
		"moe.gate_proj", "ffn_gate_exps.weight",
		"moe.up_proj", "ffn_up_exps.weight",
		"moe.down_proj", "ffn_down_exps.weight",
		"moe.per_expert_scale", "ffn_down_exps.scale",

		// Layer scalar
		"layer_scalar", "layer_output_scale.weight",
	}
}
|
||||
|
|
@ -205,8 +205,8 @@ func TestConvertInvalidDatatype(t *testing.T) {
|
|||
generateSafetensorTestData(t, tempDir, td)
|
||||
|
||||
err = ConvertModel(os.DirFS(tempDir), f)
|
||||
if err == nil || err.Error() != "unsupported safetensors model" {
|
||||
t.Errorf("expected error but didn't get one")
|
||||
if err == nil || !strings.Contains(err.Error(), "unknown data type") {
|
||||
t.Errorf("expected 'unknown data type' error but got: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -44,6 +44,7 @@ func (t tensorBase) Kind() uint32 {
|
|||
strings.HasSuffix(t.name, ".ssm_conv1d.weight") || // SSM conv kernel must be F32 for Metal
|
||||
t.name == "token_types.weight" ||
|
||||
t.name == "v.positional_embedding_vlm" ||
|
||||
t.name == "v.position_embd.weight" ||
|
||||
t.name == "v.tile_position_embd.weight" ||
|
||||
t.name == "v.pre_tile_position_embd.weight" ||
|
||||
t.name == "v.post_tile_position_embd.weight" ||
|
||||
|
|
|
|||
|
|
@ -5,7 +5,6 @@ import (
|
|||
"bytes"
|
||||
"encoding/binary"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"io/fs"
|
||||
|
|
@ -53,9 +52,10 @@ func parseSafetensors(fsys fs.FS, replacer *strings.Replacer, ps ...string) ([]T
|
|||
|
||||
for _, key := range keys {
|
||||
if value := headers[key]; value.Type != "" {
|
||||
// bitsandbytes quantized models are unsupported
|
||||
// Scalar tensors (e.g. clipped linear min/max) are 0-dim in safetensors.
|
||||
// Promote them to 1-dim so they can be stored in GGUF.
|
||||
if len(value.Shape) == 0 {
|
||||
return nil, errors.New("unsupported safetensors model")
|
||||
value.Shape = []uint64{1}
|
||||
}
|
||||
ggufName := replacer.Replace(key)
|
||||
if _, ok := names[ggufName]; ok {
|
||||
|
|
|
|||
|
|
@ -281,6 +281,7 @@ func (kv KV) OllamaEngineRequired() bool {
|
|||
"deepseekocr",
|
||||
"gemma3",
|
||||
"gemma3n",
|
||||
"gemma4",
|
||||
"gptoss", "gpt-oss",
|
||||
"llama4",
|
||||
"mistral3",
|
||||
|
|
|
|||
121
llama/patches/0035-CUDA-get_rows-q6_k-support.patch
Normal file
121
llama/patches/0035-CUDA-get_rows-q6_k-support.patch
Normal file
|
|
@ -0,0 +1,121 @@
|
|||
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
||||
From: Daniel Hiltgen <daniel@ollama.com>
|
||||
Date: Fri, 20 Mar 2026 18:50:38 -0700
|
||||
Subject: [PATCH] CUDA get_rows q6_k support
|
||||
|
||||
---
|
||||
ggml/src/ggml-cuda/getrows.cu | 80 ++++++++++++++++++++++++++++++++-
|
||||
ggml/src/ggml-cuda/ggml-cuda.cu | 1 +
|
||||
2 files changed, 80 insertions(+), 1 deletion(-)
|
||||
|
||||
diff --git a/ggml/src/ggml-cuda/getrows.cu b/ggml/src/ggml-cuda/getrows.cu
|
||||
index 2fab33243..dc5c4f57a 100644
|
||||
--- a/ggml/src/ggml-cuda/getrows.cu
|
||||
+++ b/ggml/src/ggml-cuda/getrows.cu
|
||||
@@ -155,6 +155,81 @@ static void get_rows_cuda_float(
|
||||
s10, s11, s12/*, s13*/);
|
||||
}
|
||||
|
||||
+// Specialized GET_ROWS kernel for Q6_K — the k_get_rows template doesn't work for K-quants
|
||||
+// because they lack the simple dequantize_kernel_t (float2) interface.
|
||||
+// Based on dequantize_block_q6_K from convert.cu with row-selection logic added.
|
||||
+template<typename dst_t>
|
||||
+static __global__ void k_get_rows_q6_K(
|
||||
+ const void * __restrict__ src0, const int32_t * __restrict__ src1, dst_t * __restrict__ dst,
|
||||
+ const int64_t ne00,
|
||||
+ const int64_t ne11, const int64_t ne12,
|
||||
+ const size_t s1, const size_t s2, const size_t s3,
|
||||
+ const size_t nb01, const size_t nb02, const size_t nb03,
|
||||
+ const size_t s10, const size_t s11, const size_t s12) {
|
||||
+
|
||||
+ const int64_t i10 = blockIdx.x; // row index into src1
|
||||
+ const int64_t z = blockIdx.z;
|
||||
+ const int64_t i11 = z / ne12;
|
||||
+ const int64_t i12 = z % ne12;
|
||||
+
|
||||
+ const int i01 = src1[i10*s10 + i11*s11 + i12*s12];
|
||||
+
|
||||
+ dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
|
||||
+ const char * src0_row = (const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03;
|
||||
+
|
||||
+ const int64_t nb = ne00 / QK_K; // number of Q6_K blocks per row
|
||||
+
|
||||
+ // blockIdx.y iterates over Q6_K blocks within the row
|
||||
+ for (int64_t iblk = blockIdx.y; iblk < nb; iblk += gridDim.y) {
|
||||
+ const block_q6_K * x = (const block_q6_K *)src0_row + iblk;
|
||||
+
|
||||
+ // Same dequantization as dequantize_block_q6_K (assumes 64 threads)
|
||||
+ const int64_t tid = threadIdx.x;
|
||||
+ const int64_t ip = tid / 32; // 0 or 1
|
||||
+ const int64_t il = tid - 32*ip; // 0..31
|
||||
+ const int64_t is = 8*ip + il/16;
|
||||
+
|
||||
+ const int64_t y_offset = iblk * QK_K + 128*ip + il;
|
||||
+
|
||||
+ const float d = x->d;
|
||||
+ const uint8_t * ql = x->ql + 64*ip + il;
|
||||
+ const uint8_t qh = x->qh[32*ip + il];
|
||||
+ const int8_t * sc = x->scales + is;
|
||||
+
|
||||
+ if (y_offset + 0 < ne00) dst_row[y_offset + 0] = ggml_cuda_cast<dst_t>(d * sc[0] * ((int8_t)((ql[ 0] & 0xF) | (((qh >> 0) & 3) << 4)) - 32));
|
||||
+ if (y_offset + 32 < ne00) dst_row[y_offset + 32] = ggml_cuda_cast<dst_t>(d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32));
|
||||
+ if (y_offset + 64 < ne00) dst_row[y_offset + 64] = ggml_cuda_cast<dst_t>(d * sc[4] * ((int8_t)((ql[ 0] >> 4) | (((qh >> 4) & 3) << 4)) - 32));
|
||||
+ if (y_offset + 96 < ne00) dst_row[y_offset + 96] = ggml_cuda_cast<dst_t>(d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32));
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
+template<typename dst_t>
|
||||
+static void get_rows_cuda_q6_K(
|
||||
+ const void * src0_d, const int32_t * src1_d, dst_t * dst_d,
|
||||
+ const int64_t ne00, const size_t nb01, const size_t nb02, const size_t nb03,
|
||||
+ const int64_t ne10, const int64_t ne11, const int64_t ne12, const size_t nb10, const size_t nb11, const size_t nb12,
|
||||
+ const size_t nb1, const size_t nb2, const size_t nb3,
|
||||
+ cudaStream_t stream) {
|
||||
+ const int64_t nb_blocks = ne00 / QK_K;
|
||||
+ const dim3 block_dims(64, 1, 1);
|
||||
+ const dim3 block_nums(ne10, MIN(nb_blocks, (int64_t)UINT16_MAX), MIN(ne11*ne12, (int64_t)UINT16_MAX));
|
||||
+
|
||||
+ const size_t s1 = nb1 / sizeof(dst_t);
|
||||
+ const size_t s2 = nb2 / sizeof(dst_t);
|
||||
+ const size_t s3 = nb3 / sizeof(dst_t);
|
||||
+
|
||||
+ const size_t s10 = nb10 / sizeof(int32_t);
|
||||
+ const size_t s11 = nb11 / sizeof(int32_t);
|
||||
+ const size_t s12 = nb12 / sizeof(int32_t);
|
||||
+
|
||||
+ k_get_rows_q6_K<<<block_nums, block_dims, 0, stream>>>(
|
||||
+ src0_d, src1_d, dst_d,
|
||||
+ ne00, ne11, ne12,
|
||||
+ s1, s2, s3,
|
||||
+ nb01, nb02, nb03,
|
||||
+ s10, s11, s12);
|
||||
+}
|
||||
+
|
||||
template <typename dst_t>
|
||||
static void ggml_cuda_get_rows_switch_src0_type(
|
||||
const void * src0_d, const ggml_type src0_type, const int32_t * src1_d, dst_t * dst_d,
|
||||
@@ -199,8 +274,11 @@ static void ggml_cuda_get_rows_switch_src0_type(
|
||||
get_rows_cuda_q<QK8_0, QR8_0, dequantize_q8_0>(src0_d, src1_d, dst_d,
|
||||
ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
|
||||
break;
|
||||
+ case GGML_TYPE_Q6_K:
|
||||
+ get_rows_cuda_q6_K(src0_d, src1_d, dst_d,
|
||||
+ ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
|
||||
+ break;
|
||||
default:
|
||||
- // TODO: k-quants
|
||||
GGML_ABORT("%s: unsupported src0 type: %s\n", __func__, ggml_type_name(src0_type));
|
||||
break;
|
||||
}
|
||||
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
index 5c9dfd032..b8ed3709b 100644
|
||||
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
@@ -4693,6 +4693,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
case GGML_TYPE_Q5_0:
|
||||
case GGML_TYPE_Q5_1:
|
||||
case GGML_TYPE_Q8_0:
|
||||
+ case GGML_TYPE_Q6_K:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
|
|
@ -137,6 +137,7 @@ type Tensor interface {
|
|||
|
||||
Bytes() []byte
|
||||
Floats() []float32
|
||||
BackendGet() []float32
|
||||
|
||||
FromBytes([]byte)
|
||||
FromFloats([]float32)
|
||||
|
|
|
|||
|
|
@ -1069,6 +1069,21 @@ func (t *Tensor) Floats() (data []float32) {
|
|||
return
|
||||
}
|
||||
|
||||
func (t *Tensor) BackendGet() []float32 {
|
||||
n := int(C.ggml_nelements(t.t))
|
||||
if n == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
if t.sync != nil {
|
||||
t.sync()
|
||||
}
|
||||
|
||||
data := make([]float32, n)
|
||||
C.ggml_backend_tensor_get(t.t, unsafe.Pointer(&data[0]), 0, C.ggml_nbytes(t.t))
|
||||
return data
|
||||
}
|
||||
|
||||
func tensorSet[S ~[]E, E byte | float32 | int32](t *Tensor, s S) {
|
||||
if len(s) == 0 {
|
||||
return
|
||||
|
|
|
|||
80
ml/backend/ggml/ggml/src/ggml-cuda/getrows.cu
vendored
80
ml/backend/ggml/ggml/src/ggml-cuda/getrows.cu
vendored
|
|
@ -155,6 +155,81 @@ static void get_rows_cuda_float(
|
|||
s10, s11, s12/*, s13*/);
|
||||
}
|
||||
|
||||
// Specialized GET_ROWS kernel for Q6_K — the k_get_rows template doesn't work for K-quants
// because they lack the simple dequantize_kernel_t (float2) interface.
// Based on dequantize_block_q6_K from convert.cu with row-selection logic added.
//
// Grid mapping: blockIdx.x = destination row (index into src1),
// blockIdx.y grid-strides over Q6_K super-blocks within the row,
// blockIdx.z = flattened (i11, i12) batch coordinates.
// Launched with 64 threads per block; each thread emits 4 of the QK_K (256)
// values of a super-block, spaced 32 elements apart.
template<typename dst_t>
static __global__ void k_get_rows_q6_K(
    const void * __restrict__ src0, const int32_t * __restrict__ src1, dst_t * __restrict__ dst,
    const int64_t ne00,
    const int64_t ne11, const int64_t ne12,
    const size_t s1, const size_t s2, const size_t s3,
    const size_t nb01, const size_t nb02, const size_t nb03,
    const size_t s10, const size_t s11, const size_t s12) {

    const int64_t i10 = blockIdx.x; // row index into src1
    const int64_t z   = blockIdx.z;
    const int64_t i11 = z / ne12;
    const int64_t i12 = z % ne12;

    // Source row to gather, looked up through the int32 index tensor.
    const int i01 = src1[i10*s10 + i11*s11 + i12*s12];

    dst_t      * dst_row  = dst + i10*s1 + i11*s2 + i12*s3;
    const char * src0_row = (const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03;

    const int64_t nb = ne00 / QK_K; // number of Q6_K blocks per row

    // blockIdx.y iterates over Q6_K blocks within the row (grid-stride, since
    // the launcher clamps gridDim.y).
    for (int64_t iblk = blockIdx.y; iblk < nb; iblk += gridDim.y) {
        const block_q6_K * x = (const block_q6_K *)src0_row + iblk;

        // Same dequantization as dequantize_block_q6_K (assumes 64 threads)
        const int64_t tid = threadIdx.x;
        const int64_t ip = tid / 32;    // super-block half: 0 or 1
        const int64_t il = tid - 32*ip; // lane within the half: 0..31
        const int64_t is = 8*ip + il/16; // index into the 16 group scales

        const int64_t y_offset = iblk * QK_K + 128*ip + il;

        const float d = x->d;                     // super-block scale
        const uint8_t * ql = x->ql + 64*ip + il;  // low 4 bits of quants
        const uint8_t qh = x->qh[32*ip + il];     // high 2 bits of quants
        const int8_t * sc = x->scales + is;       // per-group int8 scales

        // Reassemble each 6-bit quant from ql/qh, re-center by -32, and scale.
        if (y_offset + 0  < ne00) dst_row[y_offset + 0]  = ggml_cuda_cast<dst_t>(d * sc[0] * ((int8_t)((ql[ 0] & 0xF) | (((qh >> 0) & 3) << 4)) - 32));
        if (y_offset + 32 < ne00) dst_row[y_offset + 32] = ggml_cuda_cast<dst_t>(d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32));
        if (y_offset + 64 < ne00) dst_row[y_offset + 64] = ggml_cuda_cast<dst_t>(d * sc[4] * ((int8_t)((ql[ 0] >> 4)  | (((qh >> 4) & 3) << 4)) - 32));
        if (y_offset + 96 < ne00) dst_row[y_offset + 96] = ggml_cuda_cast<dst_t>(d * sc[6] * ((int8_t)((ql[32] >> 4)  | (((qh >> 6) & 3) << 4)) - 32));
    }
}
|
||||
|
||||
// Host-side launcher for k_get_rows_q6_K. Converts byte strides (nb*) to
// element strides (s*) and clamps oversized grid dimensions.
template<typename dst_t>
static void get_rows_cuda_q6_K(
    const void * src0_d, const int32_t * src1_d, dst_t * dst_d,
    const int64_t ne00, const size_t nb01, const size_t nb02, const size_t nb03,
    const int64_t ne10, const int64_t ne11, const int64_t ne12, const size_t nb10, const size_t nb11, const size_t nb12,
    const size_t nb1, const size_t nb2, const size_t nb3,
    cudaStream_t stream) {
    const int64_t nb_blocks = ne00 / QK_K;
    const dim3 block_dims(64, 1, 1);
    // y is clamped to the 65535 grid limit and the kernel grid-strides over it.
    // NOTE(review): z is clamped too, but the kernel does NOT stride over z —
    // if ne11*ne12 ever exceeded 65535, those batch slices would be silently
    // skipped; confirm batch shapes stay within the limit.
    const dim3 block_nums(ne10, MIN(nb_blocks, (int64_t)UINT16_MAX), MIN(ne11*ne12, (int64_t)UINT16_MAX));

    // Destination strides in elements of dst_t.
    const size_t s1 = nb1 / sizeof(dst_t);
    const size_t s2 = nb2 / sizeof(dst_t);
    const size_t s3 = nb3 / sizeof(dst_t);

    // Index-tensor strides in int32 elements.
    const size_t s10 = nb10 / sizeof(int32_t);
    const size_t s11 = nb11 / sizeof(int32_t);
    const size_t s12 = nb12 / sizeof(int32_t);

    k_get_rows_q6_K<<<block_nums, block_dims, 0, stream>>>(
        src0_d, src1_d, dst_d,
        ne00, ne11, ne12,
        s1, s2, s3,
        nb01, nb02, nb03,
        s10, s11, s12);
}
|
||||
|
||||
template <typename dst_t>
|
||||
static void ggml_cuda_get_rows_switch_src0_type(
|
||||
const void * src0_d, const ggml_type src0_type, const int32_t * src1_d, dst_t * dst_d,
|
||||
|
|
@ -199,8 +274,11 @@ static void ggml_cuda_get_rows_switch_src0_type(
|
|||
get_rows_cuda_q<QK8_0, QR8_0, dequantize_q8_0>(src0_d, src1_d, dst_d,
|
||||
ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q6_K:
|
||||
get_rows_cuda_q6_K(src0_d, src1_d, dst_d,
|
||||
ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
|
||||
break;
|
||||
default:
|
||||
// TODO: k-quants
|
||||
GGML_ABORT("%s: unsupported src0 type: %s\n", __func__, ggml_type_name(src0_type));
|
||||
break;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -4693,6 +4693,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
|||
case GGML_TYPE_Q5_0:
|
||||
case GGML_TYPE_Q5_1:
|
||||
case GGML_TYPE_Q8_0:
|
||||
case GGML_TYPE_Q6_K:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
|
|
|
|||
188
model/models/gemma4/model.go
Normal file
188
model/models/gemma4/model.go
Normal file
|
|
@ -0,0 +1,188 @@
|
|||
package gemma4
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"image"
|
||||
"log/slog"
|
||||
"slices"
|
||||
"time"
|
||||
|
||||
"github.com/ollama/ollama/fs"
|
||||
"github.com/ollama/ollama/kvcache"
|
||||
"github.com/ollama/ollama/ml"
|
||||
"github.com/ollama/ollama/ml/nn"
|
||||
"github.com/ollama/ollama/ml/nn/rope"
|
||||
"github.com/ollama/ollama/model"
|
||||
"github.com/ollama/ollama/model/input"
|
||||
"github.com/ollama/ollama/tokenizer"
|
||||
)
|
||||
|
||||
// Model is the full Gemma 4 multimodal model: a SigLIP-style vision encoder
// ("v" tensor prefix), a text decoder, and a projector ("mm" prefix) mapping
// vision embeddings into the text embedding space.
type Model struct {
	model.Base
	tokenizer.Tokenizer

	*VisionModel `gguf:"v"`
	*TextModel

	*MultiModalProjector `gguf:"mm"`

	ImageProcessor

	// Special token IDs bracketing image embeddings in the token stream;
	// -1 when the token is absent from the vocabulary.
	imageTokenID    int32
	imageEndTokenID int32
}

// Compile-time check that Model implements multimodal processing.
var _ model.MultimodalProcessor = (*Model)(nil)
|
||||
|
||||
// MultiModalProjector maps pooled vision embeddings into the text model's
// embedding space via a clamped linear layer followed by an unweighted RMSNorm.
type MultiModalProjector struct {
	Projection *ClippableLinear `gguf:"input_projection"`
}

// Forward projects visionOutputs and normalizes the result.
// eps is the RMSNorm epsilon shared with the text model.
func (p *MultiModalProjector) Forward(ctx ml.Context, visionOutputs ml.Tensor, eps float32) ml.Tensor {
	visionOutputs = p.Projection.Forward(ctx, visionOutputs)
	// Post-projection RMSNorm without learned weight (nil weight tensor).
	visionOutputs = visionOutputs.RMSNorm(ctx, nil, eps)
	return visionOutputs
}
|
||||
|
||||
// New constructs a Gemma 4 model from GGUF metadata: builds the BPE tokenizer
// (SentencePiece-style normalization), resolves the vision marker token IDs,
// instantiates the text/vision submodels, and wires up the hybrid KV cache
// (sliding-window + full-attention).
func New(c fs.Config) (model.Model, error) {
	vocabulary := tokenizer.Vocabulary{
		Values: c.Strings("tokenizer.ggml.tokens"),
		Scores: c.Floats("tokenizer.ggml.scores"),
		Types:  c.Ints("tokenizer.ggml.token_type"),
		Merges: c.Strings("tokenizer.ggml.merges"),
		AddBOS: c.Bool("tokenizer.ggml.add_bos_token", false),
		BOS:    []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
		AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
		EOS: append(
			[]int32{
				int32(c.Uint("tokenizer.ggml.eos_token_id")),
			},
			c.Ints("tokenizer.ggml.eos_token_ids")...,
		),
	}

	// End-of-turn also terminates generation; defaults to token 106 when the
	// metadata key is absent.
	vocabulary.EOS = append(vocabulary.EOS, int32(c.Uint("tokenizer.ggml.eot_token_id", 106)))

	// Gemma 4 uses BPE with SentencePiece-style ▁ space markers (not GPT-2 byte-level encoding).
	// The tokenizer.json has merges and a Replace normalizer (space → ▁), with no pre-tokenizer.
	t := tokenizer.NewBytePairEncodingWithOptions(&vocabulary, []string{},
		tokenizer.WithSentencePieceNormalizer())

	// Look up special token IDs for vision; -1 means "not present".
	imageTokenID := int32(-1)
	imageEndTokenID := int32(-1)
	for i, tok := range vocabulary.Values {
		switch tok {
		case "<|image>":
			imageTokenID = int32(i)
		case "<image|>":
			imageEndTokenID = int32(i)
		}
	}

	m := Model{
		Tokenizer:           t,
		TextModel:           newTextModel(c),
		VisionModel:         newVisionModel(c),
		MultiModalProjector: &MultiModalProjector{},
		ImageProcessor:      newImageProcessor(c),
		imageTokenID:        imageTokenID,
		imageEndTokenID:     imageEndTokenID,
	}

	// Wrapper cache: type 0 (cacheTypeSWA) = sliding-window layers,
	// type 1 (cacheTypeCausal) = full-attention layers.
	slidingWindowLen := int32(c.Uint("attention.sliding_window"))
	m.Cache = kvcache.NewWrapperCache(kvcache.NewSWACache(slidingWindowLen, m.Shift), kvcache.NewCausalCache(m.Shift))

	return &m, nil
}
|
||||
|
||||
// EncodeMultimodal decodes an image, preprocesses it, runs the vision encoder,
// and pools+projects the result into text-embedding space. Returns
// model.ErrNoVisionModel for text-only checkpoints (no vision layers loaded).
//
// NOTE(review): the per-stage slog.Info timing calls are noisy for every image
// encode; consider slog.Debug.
func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) {
	if len(m.VisionModel.Layers) == 0 {
		return nil, model.ErrNoVisionModel
	}

	// Initialize clamp values from model tensors (lazy, once, after model is fully loaded)
	m.VisionModel.InitClamp(m.MultiModalProjector)

	t0 := time.Now()
	img, _, err := image.Decode(bytes.NewReader(multimodalData))
	if err != nil {
		return nil, err
	}
	slog.Info("vision: decode", "elapsed", time.Since(t0), "bounds", img.Bounds())

	t1 := time.Now()
	f32s, imgW, imgH, err := m.ImageProcessor.ProcessImage(img)
	if err != nil {
		return nil, err
	}
	slog.Info("vision: preprocess", "elapsed", time.Since(t1), "size", [2]int{imgW, imgH})

	pixelValues := ctx.Input().FromFloats(f32s, imgW, imgH, m.ImageProcessor.numChannels)
	slog.Info("vision: pixelValues", "shape", pixelValues.Shape(), "dim0", pixelValues.Dim(0), "dim1", pixelValues.Dim(1), "dim2", pixelValues.Dim(2))

	// Patch grid dimensions; assumes the processed image is an exact multiple
	// of patchSize (ProcessImage is expected to guarantee this — TODO confirm).
	numPatchesX := imgW / m.ImageProcessor.patchSize
	numPatchesY := imgH / m.ImageProcessor.patchSize
	slog.Info("vision: patches", "patchesX", numPatchesX, "patchesY", numPatchesY, "total", numPatchesX*numPatchesY, "patchSize", m.ImageProcessor.patchSize)

	visionOutputs := m.VisionModel.Forward(ctx, pixelValues, numPatchesX, numPatchesY)
	visionOutputs = visionPoolAndProject(ctx, visionOutputs, numPatchesX, numPatchesY, m.VisionModel.VisionModelOptions, m.MultiModalProjector)
	slog.Info("vision: encoded", "elapsed", time.Since(t0), "shape", visionOutputs.Shape())

	return []input.Multimodal{{Tensor: visionOutputs}}, nil
}
|
||||
|
||||
func (m *Model) PostTokenize(inputs []*input.Input) ([]*input.Input, error) {
|
||||
var result []*input.Input
|
||||
|
||||
for _, inp := range inputs {
|
||||
if len(inp.Multimodal) == 0 {
|
||||
result = append(result, inp)
|
||||
} else {
|
||||
inputMultimodal := inp.Multimodal[0].Tensor
|
||||
numImageTokens := inputMultimodal.Dim(1)
|
||||
|
||||
// <|image>
|
||||
if m.imageTokenID >= 0 {
|
||||
result = append(result, &input.Input{Token: m.imageTokenID, SameBatch: numImageTokens + 2})
|
||||
}
|
||||
|
||||
// Image embedding placeholder tokens
|
||||
result = append(result,
|
||||
&input.Input{Multimodal: []input.Multimodal{{Tensor: inputMultimodal}}, MultimodalHash: inp.MultimodalHash},
|
||||
)
|
||||
result = append(result, slices.Repeat([]*input.Input{{Token: 0}}, numImageTokens-1)...)
|
||||
|
||||
// <image|>
|
||||
if m.imageEndTokenID >= 0 {
|
||||
result = append(result, &input.Input{Token: m.imageEndTokenID})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return result, nil
|
||||
}
|
||||
|
||||
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
|
||||
hiddenState := m.TextModel.Forward(ctx, batch, m.Cache)
|
||||
|
||||
hiddenState = m.TextModel.Output.Forward(ctx, hiddenState)
|
||||
|
||||
if m.TextModel.TextOptions.finalLogitSoftcap > 0.0 {
|
||||
hiddenState = hiddenState.Scale(ctx, 1.0/float64(m.TextModel.TextOptions.finalLogitSoftcap))
|
||||
hiddenState = hiddenState.Tanh(ctx)
|
||||
hiddenState = hiddenState.Scale(ctx, float64(m.TextModel.TextOptions.finalLogitSoftcap))
|
||||
}
|
||||
|
||||
return hiddenState, nil
|
||||
}
|
||||
|
||||
// Shift re-applies RoPE to cached keys when the KV cache slides, using the
// layer-appropriate base frequency and rotary dimension count.
//
// NOTE(review): unlike TextSelfAttention.Forward, this does not pass
// rope.WithFactors for global layers — confirm whether shifted keys on
// full-attention layers need the proportional freq_factors too.
func (m *Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
	ropeBase, ropeDims := m.TextModel.ropeForLayer(layer)
	return nn.RoPE(ctx, key, shift, ropeDims, ropeBase, 1.0, rope.WithTypeNeoX()), nil
}
|
||||
|
||||
// init registers the gemma4 architecture with the model loader registry.
func init() {
	model.Register("gemma4", New)
}
|
||||
454
model/models/gemma4/model_text.go
Normal file
454
model/models/gemma4/model_text.go
Normal file
|
|
@ -0,0 +1,454 @@
|
|||
package gemma4
|
||||
|
||||
import (
|
||||
"math"
|
||||
|
||||
"github.com/ollama/ollama/fs"
|
||||
"github.com/ollama/ollama/kvcache"
|
||||
"github.com/ollama/ollama/ml"
|
||||
"github.com/ollama/ollama/ml/nn"
|
||||
"github.com/ollama/ollama/ml/nn/rope"
|
||||
"github.com/ollama/ollama/model/input"
|
||||
)
|
||||
|
||||
const (
|
||||
cacheTypeSWA = iota
|
||||
cacheTypeCausal
|
||||
)
|
||||
|
||||
// TextOptions holds the text-decoder hyperparameters read from GGUF metadata.
type TextOptions struct {
	hiddenSize           int
	numHeads, numKVHeads int
	numGlobalKVHeads     int // KV heads on full-attention layers (0 = same as numKVHeads)
	headDim, globalHeadDim int // per-head dims for sliding vs. full attention
	hiddenLayers         int
	hiddenSizePerLayerInput int // PLE embedding width (0 when PLE is absent)

	eps           float32 // RMSNorm epsilon
	ropeBase      float32 // RoPE base frequency for full-attention layers
	ropeLocalBase float32 // RoPE base frequency for sliding-window layers
	partialRotaryDims int // RoPE dims for full-attention (global) layers

	// slidingWindowPattern[i] is true when layer i uses sliding-window
	// (local) attention.
	slidingWindowPattern []bool
	// kvDonorMap maps shared layer index -> donor layer index.
	// Donor is the last non-shared layer of the same type (sliding/full).
	kvDonorMap map[int]int

	finalLogitSoftcap float32 // 0 disables softcapping

	// MoE configuration (0 for dense-only models).
	numExperts     int
	numExpertsUsed int
}
|
||||
|
||||
func (o *TextOptions) isLocal(layer int) bool {
|
||||
if layer < len(o.slidingWindowPattern) {
|
||||
return o.slidingWindowPattern[layer]
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func (o *TextOptions) ropeForLayer(layer int) (base float32, dims int) {
|
||||
if o.isLocal(layer) {
|
||||
return o.ropeLocalBase, o.headDim
|
||||
}
|
||||
return o.ropeBase, o.partialRotaryDims
|
||||
}
|
||||
|
||||
func (o *TextOptions) kvHeadsForLayer(layer int) int {
|
||||
if o.isLocal(layer) {
|
||||
return o.numKVHeads
|
||||
}
|
||||
if o.numGlobalKVHeads > 0 {
|
||||
return o.numGlobalKVHeads
|
||||
}
|
||||
return o.numKVHeads
|
||||
}
|
||||
|
||||
func (o *TextOptions) headDimForLayer(layer int) int {
|
||||
if o.isLocal(layer) {
|
||||
return o.headDim
|
||||
}
|
||||
return o.globalHeadDim
|
||||
}
|
||||
|
||||
// TextModel is the Gemma 4 text decoder: token embedding, optional per-layer
// embedding projector (PLE), the transformer stack, final norm, and the output
// head (tied to the token embedding when no separate "output" tensor exists).
type TextModel struct {
	TokenEmbedding *nn.Embedding `gguf:"token_embd"`
	*PerLayerProjector
	Layers     []TextLayer `gguf:"blk"`
	OutputNorm *nn.RMSNorm `gguf:"output_norm"`
	Output     *nn.Linear  `gguf:"output,alt:token_embd"`
	TextOptions
}
|
||||
|
||||
// newTextModel reads the text-decoder hyperparameters from GGUF metadata,
// handling both dense and MoE variants, and precomputes the KV-sharing donor
// map for checkpoints with shared KV layers.
func newTextModel(c fs.Config) *TextModel {
	numLayers := int(c.Uint("block_count"))

	// Head dimensions: key_length is global head dim, key_length_swa is local (SWA) head dim.
	globalHeadDim := int(c.Uint("attention.key_length", 512))
	headDim := int(c.Uint("attention.key_length_swa", 256))

	// RoPE dimensions for global (full attention) layers with proportional RoPE.
	// The freq_factors tensor handles partial rotation (1.0 for rotated pairs,
	// 1e30 for non-rotated), so ropeDims equals the full global head dim.
	partialRotaryDims := int(c.Uint("rope.dimension_count", 0))
	if partialRotaryDims == 0 {
		// Fall back to deriving the rotary dims from the partial factor.
		partialFactor := c.Float("rope.partial_rotary_factor", 1.0)
		partialRotaryDims = int(float32(globalHeadDim) * partialFactor)
	}

	ropeBase := c.Float("rope.freq_base", 1000000.0)
	ropeLocalBase := c.Float("rope.freq_base_swa", 0)
	if ropeLocalBase == 0 {
		// Older metadata key for the sliding-window RoPE base.
		ropeLocalBase = c.Float("rope.local.freq_base", 10000.0)
	}

	numGlobalKVHeads := int(c.Uint("attention.global_head_count_kv", 0))
	slidingPattern := c.Bools("attention.sliding_window_pattern")

	// KV heads: try per-layer array first (MoE models), then fall back to scalar
	numKVHeads := 0
	kvHeadsArray := c.Ints("attention.head_count_kv")
	if len(kvHeadsArray) > 0 {
		numKVHeads = int(kvHeadsArray[0])
		if numGlobalKVHeads == 0 && len(slidingPattern) > 0 {
			// Infer the global KV-head count from the first full-attention
			// layer's entry in the per-layer array.
			for i, isLocal := range slidingPattern {
				if !isLocal && i < len(kvHeadsArray) {
					numGlobalKVHeads = int(kvHeadsArray[i])
					break
				}
			}
		}
	}
	if numKVHeads == 0 {
		numKVHeads = int(c.Uint("attention.head_count_kv", 0))
	}

	// Compute KV sharing donor map (same logic as MLX): the last sharedLayers
	// layers reuse K/V from the most recent earlier layer of the same
	// attention type (sliding vs. full).
	sharedLayers := int(c.Uint("attention.shared_kv_layers", 0))
	kvDonorMap := make(map[int]int)
	if sharedLayers > 0 && len(slidingPattern) > 0 {
		firstShared := numLayers - sharedLayers
		for i := firstShared; i < numLayers; i++ {
			isLocal := slidingPattern[i]
			// Find last non-shared layer of same type
			for j := firstShared - 1; j >= 0; j-- {
				if slidingPattern[j] == isLocal {
					kvDonorMap[i] = j
					break
				}
			}
		}
	}

	return &TextModel{
		Layers: make([]TextLayer, numLayers),
		TextOptions: TextOptions{
			hiddenSize:              int(c.Uint("embedding_length")),
			numHeads:                int(c.Uint("attention.head_count")),
			numKVHeads:              numKVHeads,
			numGlobalKVHeads:        numGlobalKVHeads,
			headDim:                 headDim,
			globalHeadDim:           globalHeadDim,
			hiddenLayers:            numLayers,
			hiddenSizePerLayerInput: int(c.Uint("embedding_length_per_layer_input", 0)),
			eps:                     c.Float("attention.layer_norm_rms_epsilon", 1e-06),
			ropeBase:                ropeBase,
			ropeLocalBase:           ropeLocalBase,
			partialRotaryDims:       partialRotaryDims,
			slidingWindowPattern:    slidingPattern,
			kvDonorMap:              kvDonorMap,
			finalLogitSoftcap:       c.Float("final_logit_softcapping", 0.0),
			numExperts:              int(c.Uint("expert_count", 0)),
			numExpertsUsed:          int(c.Uint("expert_used_count", 0)),
		},
	}
}
|
||||
|
||||
func (m *TextModel) Forward(ctx ml.Context, batch input.Batch, cache kvcache.Cache) ml.Tensor {
|
||||
positions := ctx.Input().FromInts(batch.Positions, len(batch.Positions))
|
||||
|
||||
hiddenState := m.TokenEmbedding.Forward(ctx, batch.Inputs)
|
||||
hiddenState = hiddenState.Scale(ctx, math.Sqrt(float64(m.hiddenSize)))
|
||||
|
||||
// Inject vision embeddings into the hidden state
|
||||
var except []int
|
||||
for _, image := range batch.Multimodal {
|
||||
visionOutputs := image.Multimodal[0].Tensor
|
||||
ctx.Forward(visionOutputs.Copy(ctx, hiddenState.View(ctx, image.Index*hiddenState.Stride(1), visionOutputs.Dim(0)*visionOutputs.Dim(1))))
|
||||
|
||||
for i := range visionOutputs.Dim(1) {
|
||||
except = append(except, image.Index+i)
|
||||
}
|
||||
}
|
||||
|
||||
// PLE
|
||||
var perLayerInputs ml.Tensor
|
||||
if m.PerLayerProjector != nil {
|
||||
perLayerInputs = m.PerLayerProjector.Forward(ctx, batch, hiddenState, &m.TextOptions)
|
||||
}
|
||||
|
||||
for i := range len(m.Layers) {
|
||||
layer := m.Layers[i]
|
||||
if cache != nil {
|
||||
cache.SetLayer(i)
|
||||
cacheType := cacheTypeSWA
|
||||
if !m.isLocal(i) {
|
||||
cacheType = cacheTypeCausal
|
||||
}
|
||||
wc := cache.(*kvcache.WrapperCache)
|
||||
wc.SetLayerType(cacheType)
|
||||
|
||||
if causal, ok := wc.UnderlyingCache().(*kvcache.Causal); ok {
|
||||
causal.SetCausal(ctx, kvcache.CausalOptions{Except: except})
|
||||
}
|
||||
}
|
||||
|
||||
var lastLayerOutputs ml.Tensor
|
||||
if i == len(m.Layers)-1 {
|
||||
lastLayerOutputs = batch.Outputs
|
||||
}
|
||||
|
||||
var perLayerInput ml.Tensor
|
||||
if perLayerInputs != nil {
|
||||
perLayerInput = perLayerInputs.View(ctx, i*perLayerInputs.Stride(1), perLayerInputs.Dim(0), perLayerInputs.Stride(2), perLayerInputs.Dim(2))
|
||||
}
|
||||
|
||||
// KV sharing: layers >= firstShared reuse K/V from donor layers
|
||||
isShared := false
|
||||
if donorLayer, ok := m.kvDonorMap[i]; ok {
|
||||
// Set cache layer to donor so Get() reads donor's K/V
|
||||
cache.SetLayer(donorLayer)
|
||||
isShared = true
|
||||
}
|
||||
hiddenState = layer.Forward(ctx, i, hiddenState, positions, perLayerInput, lastLayerOutputs, cache, isShared, &m.TextOptions)
|
||||
}
|
||||
|
||||
return m.OutputNorm.Forward(ctx, hiddenState, m.eps)
|
||||
}
|
||||
|
||||
// PerLayerProjector implements PLE (per-layer embeddings): each token gets a
// small learned embedding per layer, combined with a projection of the main
// hidden state.
type PerLayerProjector struct {
	TokenEmbedding *nn.Embedding `gguf:"per_layer_token_embd"`
	Projector      *nn.Linear    `gguf:"per_layer_model_proj"`
	Norm           *nn.RMSNorm   `gguf:"per_layer_proj_norm"`
}

// Forward returns the packed per-layer inputs with shape
// [pleDim, numLayers, numTokens]: normalized projection of the hidden state
// averaged (via /sqrt(2)) with the scaled per-layer token embeddings.
func (p *PerLayerProjector) Forward(ctx ml.Context, batch input.Batch, inputs ml.Tensor, opts *TextOptions) ml.Tensor {
	inputsPerLayer := p.TokenEmbedding.Forward(ctx, batch.Inputs)
	inputsPerLayer = inputsPerLayer.Scale(ctx, math.Sqrt(float64(opts.hiddenSizePerLayerInput)))
	// Reshape to [pleDim, numLayers, numTokens] — matching projection shape
	inputsPerLayer = inputsPerLayer.Reshape(ctx, opts.hiddenSizePerLayerInput, opts.hiddenLayers, inputs.Dim(1))

	perLayerProjection := p.Projector.Forward(ctx, inputs)
	perLayerProjection = perLayerProjection.Scale(ctx, 1.0/math.Sqrt(float64(opts.hiddenSize)))
	perLayerProjection = perLayerProjection.Reshape(ctx, opts.hiddenSizePerLayerInput, opts.hiddenLayers, inputs.Dim(1))
	perLayerProjection = p.Norm.Forward(ctx, perLayerProjection, opts.eps)

	// NOTE(review): inputsPerLayer was just assigned above, so this nil check
	// appears to always be true; kept for safety.
	if inputsPerLayer != nil {
		perLayerProjection = perLayerProjection.Add(ctx, inputsPerLayer)
		// Average of the two terms in RMS sense.
		perLayerProjection = perLayerProjection.Scale(ctx, 1/math.Sqrt(2))
	}

	return perLayerProjection
}
|
||||
|
||||
// TextSelfAttention is Gemma 4 attention with Q/K RMSNorms, optional KV
// sharing (reuse a donor layer's cache), an optional K=V mode (no separate
// value projection), and proportional RoPE freq_factors on global layers.
type TextSelfAttention struct {
	Query       *nn.Linear  `gguf:"attn_q"`
	QueryNorm   *nn.RMSNorm `gguf:"attn_q_norm"`
	Key         *nn.Linear  `gguf:"attn_k"`
	KeyNorm     *nn.RMSNorm `gguf:"attn_k_norm"`
	Value       *nn.Linear  `gguf:"attn_v"`
	Output      *nn.Linear  `gguf:"attn_output"`
	RopeFactors ml.Tensor   `gguf:"rope_freqs.weight"` // proportional RoPE freq_factors
}

// Forward computes self-attention for one layer. When sharedKV is true, K/V
// are left nil so nn.Attention reads them from the (donor) cache.
func (sa *TextSelfAttention) Forward(ctx ml.Context, layer int, hiddenState, positions ml.Tensor, cache kvcache.Cache, sharedKV bool, opts *TextOptions) ml.Tensor {
	batchSize := hiddenState.Dim(1)
	hd := opts.headDimForLayer(layer)
	kvHeads := opts.kvHeadsForLayer(layer)
	ropeBase, ropeDims := opts.ropeForLayer(layer)

	q := sa.Query.Forward(ctx, hiddenState)
	q = q.Reshape(ctx, hd, opts.numHeads, batchSize)
	q = sa.QueryNorm.Forward(ctx, q, opts.eps)

	var k, v ml.Tensor
	if !sharedKV {
		k = sa.Key.Forward(ctx, hiddenState)
		k = k.Reshape(ctx, hd, kvHeads, batchSize)

		if sa.Value != nil {
			v = sa.Value.Forward(ctx, hiddenState)
			v = v.Reshape(ctx, hd, kvHeads, batchSize)
		} else {
			// K=V: use raw K projection (before K norm) as V.
			v = k
		}

		// Order matters: v must capture the pre-norm K projection above.
		k = sa.KeyNorm.Forward(ctx, k, opts.eps)
		v = v.RMSNorm(ctx, nil, opts.eps) // V norm: unweighted RMSNorm
	}

	// RoPE with proportional freq_factors on global layers only.
	ropeOpts := []func(*rope.Options){rope.WithTypeNeoX()}
	if sa.RopeFactors != nil && !opts.isLocal(layer) {
		ropeOpts = append(ropeOpts, rope.WithFactors(sa.RopeFactors))
	}
	q = nn.RoPE(ctx, q, positions, ropeDims, ropeBase, 1.0, ropeOpts...)
	if k != nil {
		k = nn.RoPE(ctx, k, positions, ropeDims, ropeBase, 1.0, ropeOpts...)
	}

	// Scale 1.0: query scaling is not applied here (matches the reference
	// implementation's convention — TODO confirm scaling lives elsewhere).
	attention := nn.Attention(ctx, q, k, v, 1.0, cache)

	attention = attention.Reshape(ctx, hd*opts.numHeads, batchSize)
	return sa.Output.Forward(ctx, attention)
}
|
||||
|
||||
type TextMLP struct {
|
||||
Gate *nn.Linear `gguf:"ffn_gate"`
|
||||
Up *nn.Linear `gguf:"ffn_up"`
|
||||
Down *nn.Linear `gguf:"ffn_down"`
|
||||
}
|
||||
|
||||
func (mlp *TextMLP) Forward(ctx ml.Context, hiddenState ml.Tensor) ml.Tensor {
|
||||
hiddenState = mlp.Gate.Forward(ctx, hiddenState).GELU(ctx, mlp.Up.Forward(ctx, hiddenState))
|
||||
return mlp.Down.Forward(ctx, hiddenState)
|
||||
}
|
||||
|
||||
// TextRouter implements the Gemma 4 MoE router.
// It does: RMSNorm(no weight) → scale(1/sqrt(hidden)) → multiply by scale param → linear → softmax → topk
type TextRouter struct {
	Proj  *nn.Linear `gguf:"ffn_gate_inp"`
	Scale ml.Tensor  `gguf:"ffn_gate_inp.scale"`
}

// Forward returns the full softmax routing weights over all experts and the
// indices of the top numExpertsUsed experts per token.
func (r *TextRouter) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *TextOptions) (routingWeights, selectedExperts ml.Tensor) {
	// RMSNorm without learned weight
	x := hiddenState.RMSNorm(ctx, nil, opts.eps)
	// Scale by 1/sqrt(hidden_size)
	x = x.Scale(ctx, 1.0/math.Sqrt(float64(opts.hiddenSize)))
	// Multiply by learned scale parameter
	x = x.Mul(ctx, r.Scale)
	// Project to expert logits
	expertScores := r.Proj.Forward(ctx, x)
	// Softmax over experts
	routingWeights = expertScores.Softmax(ctx)
	// TopK expert selection
	selectedExperts = routingWeights.TopK(ctx, opts.numExpertsUsed)
	return routingWeights, selectedExperts
}
|
||||
|
||||
// TextMoEBlock implements the Gemma 4 sparse MoE: batched expert GELU-gated
// MLPs selected per token via MulmatID, weighted by renormalized router scores.
type TextMoEBlock struct {
	Gate *nn.LinearBatch `gguf:"ffn_gate_exps"`
	Up   *nn.LinearBatch `gguf:"ffn_up_exps"`
	Down *nn.LinearBatch `gguf:"ffn_down_exps"`
}

// Forward evaluates the selected experts on hiddenState and returns the
// routing-weighted sum of their outputs.
func (moe *TextMoEBlock) Forward(ctx ml.Context, hiddenState, routingWeights, selectedExperts ml.Tensor, opts *TextOptions) ml.Tensor {
	// Select routing weights for chosen experts and renormalize so the
	// selected weights sum to 1 per token.
	routingWeights = routingWeights.Reshape(ctx, 1, opts.numExperts, hiddenState.Dim(1)).Rows(ctx, selectedExperts)
	routingWeights = routingWeights.Reshape(ctx, opts.numExpertsUsed, hiddenState.Dim(1))
	routingWeights = routingWeights.Div(ctx, routingWeights.SumRows(ctx))
	routingWeights = routingWeights.Reshape(ctx, 1, opts.numExpertsUsed, hiddenState.Dim(1))

	// Broadcast the token states across the expert dimension.
	hiddenState = hiddenState.Reshape(ctx, hiddenState.Dim(0), 1, hiddenState.Dim(1))

	// Expert computation using LinearBatch (MulmatID selecting experts by index)
	gateOut := moe.Gate.Forward(ctx, hiddenState, selectedExperts)
	upOut := moe.Up.Forward(ctx, hiddenState, selectedExperts)
	hiddenState = gateOut.GELU(ctx, upOut)
	experts := moe.Down.Forward(ctx, hiddenState, selectedExperts)

	// Apply routing weights
	experts = experts.Mul(ctx, routingWeights)

	// Sum across experts via strided views (expert i is a view at offset
	// i*Stride(1)).
	nextStates := experts.View(ctx, 0, experts.Dim(0), experts.Stride(2), experts.Dim(2))
	for i := 1; i < opts.numExpertsUsed; i++ {
		nextStates = nextStates.Add(ctx, experts.View(ctx, i*experts.Stride(1), experts.Dim(0), experts.Stride(2), experts.Dim(2)))
	}

	return nextStates
}
|
||||
|
||||
// TextLayer is one decoder block: attention with pre/post norms, a gated MLP
// with pre/post norms, an optional parallel MoE branch, optional PLE
// injection, and an optional per-layer output scalar.
type TextLayer struct {
	AttentionNorm     *nn.RMSNorm `gguf:"attn_norm"`
	SelfAttention     *TextSelfAttention
	PostAttentionNorm *nn.RMSNorm `gguf:"post_attention_norm,alt:attn_post_norm"`
	MLPNorm           *nn.RMSNorm `gguf:"ffn_norm,alt:ffn_pre_norm"`
	MLP               *TextMLP
	PostMLPNorm       *nn.RMSNorm `gguf:"post_ffw_norm,alt:ffn_post_norm"`

	// MoE (present only for models with enable_moe_block=true)
	Router       *TextRouter
	MoE          *TextMoEBlock
	MoENorm      *nn.RMSNorm `gguf:"pre_ffw_norm_2,alt:ffn_pre_norm_2"`
	PostMoENorm  *nn.RMSNorm `gguf:"post_ffw_norm_2,alt:ffn_post_norm_2"`
	PostMLPNorm1 *nn.RMSNorm `gguf:"post_ffw_norm_1,alt:ffn_post_norm_1"` // used instead of PostMLPNorm when MoE is present

	// PLE injection weights (present only on models with per-layer embeddings).
	PerLayerInputGate  *nn.Linear  `gguf:"inp_gate"`
	PerLayerProjection *nn.Linear  `gguf:"proj"`
	PostPerLayerNorm   *nn.RMSNorm `gguf:"post_norm"`
	LayerScalar        ml.Tensor   `gguf:"layer_scalar,alt:layer_output_scale.weight"`
}
|
||||
|
||||
// Forward runs one decoder block. The statement order (norm → sublayer →
// post-norm → residual add) follows Gemma's "sandwich" norm placement and must
// not be rearranged. When outputs is non-nil (final layer), the hidden state,
// residual, and PLE input are first reduced to just the requested rows.
func (l *TextLayer) Forward(ctx ml.Context, layer int, hiddenState, positions, perLayerInput, outputs ml.Tensor, cache kvcache.Cache, sharedKV bool, opts *TextOptions) ml.Tensor {
	residual := hiddenState

	hiddenState = l.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
	hiddenState = l.SelfAttention.Forward(ctx, layer, hiddenState, positions, cache, sharedKV, opts)
	hiddenState = l.PostAttentionNorm.Forward(ctx, hiddenState, opts.eps)

	// Final layer: keep only the rows whose logits the caller asked for.
	if outputs != nil {
		hiddenState = hiddenState.Rows(ctx, outputs)
		residual = residual.Rows(ctx, outputs)
		if perLayerInput != nil {
			perLayerInput = perLayerInput.Rows(ctx, outputs)
		}
	}

	hiddenState = hiddenState.Add(ctx, residual)
	residual = hiddenState

	// MLP (+ optional MoE in parallel)
	if l.Router != nil && l.MoE != nil && l.MoE.Gate != nil && l.MoE.Gate.Weight != nil {
		// MoE layers: run MLP and MoE in parallel, sum results
		mlpState := l.MLPNorm.Forward(ctx, hiddenState, opts.eps)
		mlpState = l.MLP.Forward(ctx, mlpState)
		mlpState = l.PostMLPNorm1.Forward(ctx, mlpState, opts.eps)

		// Router sees the pre-norm hidden state; the MoE branch gets its own norm.
		routingWeights, selectedExperts := l.Router.Forward(ctx, hiddenState, opts)
		moeState := l.MoENorm.Forward(ctx, hiddenState, opts.eps)
		moeState = l.MoE.Forward(ctx, moeState, routingWeights, selectedExperts, opts)
		moeState = l.PostMoENorm.Forward(ctx, moeState, opts.eps)

		// Combine MLP + MoE, apply outer post-FFN norm, then add residual
		combined := mlpState.Add(ctx, moeState)
		combined = l.PostMLPNorm.Forward(ctx, combined, opts.eps)
		hiddenState = combined.Add(ctx, residual)
	} else {
		// Dense layers: MLP only
		hiddenState = l.MLPNorm.Forward(ctx, hiddenState, opts.eps)
		hiddenState = l.MLP.Forward(ctx, hiddenState)
		hiddenState = l.PostMLPNorm.Forward(ctx, hiddenState, opts.eps)
		hiddenState = hiddenState.Add(ctx, residual)
	}

	// PLE injection (after MLP residual): gate the per-layer embedding with a
	// GELU of the gated hidden state, project, normalize, and add.
	if perLayerInput != nil && l.PerLayerInputGate != nil {
		pleState := l.PerLayerInputGate.Forward(ctx, hiddenState)
		pleState = pleState.GELU(ctx, perLayerInput)
		pleState = l.PerLayerProjection.Forward(ctx, pleState)
		pleState = l.PostPerLayerNorm.Forward(ctx, pleState, opts.eps)
		hiddenState = hiddenState.Add(ctx, pleState)
	}

	// Layer scalar applied at end of layer (per the tensor's presence —
	// stated as full-attention layers only by the original author).
	if l.LayerScalar != nil {
		hiddenState = hiddenState.Mul(ctx, l.LayerScalar)
	}

	return hiddenState
}
|
||||
304
model/models/gemma4/model_vision.go
Normal file
304
model/models/gemma4/model_vision.go
Normal file
|
|
@ -0,0 +1,304 @@
|
|||
package gemma4
|
||||
|
||||
import (
|
||||
"github.com/ollama/ollama/fs"
|
||||
"github.com/ollama/ollama/ml"
|
||||
"github.com/ollama/ollama/ml/nn"
|
||||
"github.com/ollama/ollama/ml/nn/rope"
|
||||
)
|
||||
|
||||
const batchSize = 1
|
||||
|
||||
// ClippableLinear is a linear layer with optional input/output clamping.
// Required by Gemma4 vision encoder for numerical stability with F16 weights.
// Clamp values are populated by VisionModel.InitClamp from the packed v.clamp_data tensor.
type ClippableLinear struct {
	Weight ml.Tensor `gguf:"weight"`

	// Clamp bounds; a max of 0 disables the corresponding clamp (see Forward).
	inMin, inMax, outMin, outMax float32
}

// Forward applies optional input clamping, the matmul, then optional output
// clamping.
//
// NOTE(review): the "max != 0" sentinel means a legitimate clamp range with
// max exactly 0 would be silently skipped — confirm the converter never emits
// such a range.
func (l *ClippableLinear) Forward(ctx ml.Context, x ml.Tensor) ml.Tensor {
	if l.inMax != 0 {
		x = x.Clamp(ctx, l.inMin, l.inMax)
	}
	out := l.Weight.Mulmat(ctx, x)
	if l.outMax != 0 {
		out = out.Clamp(ctx, l.outMin, l.outMax)
	}
	return out
}
|
||||
|
||||
// InitClamp distributes packed clamp values from v.clamp_data to ClippableLinear structs.
// Layout: numLayers × 7 linears (q,k,v,out,gate,up,down) × 4 floats (inMin,inMax,outMin,outMax)
// then 4 floats for the projector.
//
// NOTE(review): the clampInitDone flag is a plain bool with no
// synchronization — concurrent EncodeMultimodal calls could race here;
// consider sync.Once if encodes can run in parallel.
func (m *VisionModel) InitClamp(proj *MultiModalProjector) {
	if m.clampInitDone || m.ClampData == nil {
		return
	}
	m.clampInitDone = true

	// Read all clamp values from packed F32 tensor
	data := m.ClampData.BackendGet()
	if len(data) == 0 {
		return
	}

	// Distribute to layer linears: 7 per layer × 4 values each.
	// Order here must match the converter's packing order.
	linears := func(l *VisionEncoderLayer) []*ClippableLinear {
		return []*ClippableLinear{
			l.SelfAttention.Query, l.SelfAttention.Key, l.SelfAttention.Value,
			l.SelfAttention.Output, l.MLP.Gate, l.MLP.Up, l.MLP.Down,
		}
	}
	for i := range m.Layers {
		for li, cl := range linears(&m.Layers[i]) {
			idx := (i*7 + li) * 4
			// Bounds check keeps a short/truncated tensor from panicking.
			if idx+3 < len(data) {
				cl.inMin = data[idx]
				cl.inMax = data[idx+1]
				cl.outMin = data[idx+2]
				cl.outMax = data[idx+3]
			}
		}
	}

	// Projector clamp values (last 4 floats)
	if proj != nil {
		projIdx := len(m.Layers) * 7 * 4
		if projIdx+3 < len(data) {
			proj.Projection.inMin = data[projIdx]
			proj.Projection.inMax = data[projIdx+1]
			proj.Projection.outMin = data[projIdx+2]
			proj.Projection.outMax = data[projIdx+3]
		}
	}
}
|
||||
|
||||
// VisionSelfAttention is the multi-head self-attention block of a vision
// encoder layer, with clippable projections, Q/K/V normalization, and 2D RoPE.
type VisionSelfAttention struct {
	Query     *ClippableLinear `gguf:"attn_q"`
	Key       *ClippableLinear `gguf:"attn_k"`
	Value     *ClippableLinear `gguf:"attn_v"`
	QueryNorm *nn.RMSNorm      `gguf:"attn_q_norm"`
	KeyNorm   *nn.RMSNorm      `gguf:"attn_k_norm"`
	Output    *ClippableLinear `gguf:"attn_out"`
}

// Forward runs self-attention over the patch sequence. posX and posY carry the
// per-patch x/y grid coordinates used by the 2D RoPE below.
//
// NOTE(review): attnMask is accepted but never used — nn.Attention is called
// with a nil mask. Confirm whether masking is intentionally unsupported here.
func (sa *VisionSelfAttention) Forward(ctx ml.Context, hiddenState, posX, posY, attnMask ml.Tensor, opts *VisionModelOptions) ml.Tensor {
	numPatches := hiddenState.Dim(1)
	headDim := opts.hiddenSize / opts.numHeads

	query := sa.Query.Forward(ctx, hiddenState)
	key := sa.Key.Forward(ctx, hiddenState)
	value := sa.Value.Forward(ctx, hiddenState)

	// Split the hidden dimension into heads: [headDim, numHeads, numPatches, 1].
	query = query.Reshape(ctx, headDim, opts.numHeads, numPatches, batchSize)
	key = key.Reshape(ctx, headDim, opts.numHeads, numPatches, batchSize)
	value = value.Reshape(ctx, headDim, opts.numHeads, numPatches, batchSize)

	// Q/K norms (Gemma-style: x * (1 + weight) / rms(x))
	query = sa.QueryNorm.Forward(ctx, query, opts.eps)
	key = sa.KeyNorm.Forward(ctx, key, opts.eps)

	// V norm (RMSNorm without learned weights)
	value = value.RMSNorm(ctx, nil, opts.eps)

	// 2D RoPE: split head dim in half, apply NeoX RoPE with x positions to first half,
	// y positions to second half, then concatenate.
	halfDim := headDim / 2
	ropeOpts := rope.WithTypeNeoX()

	// First half of each head rotates with the x coordinate.
	qFirst := query.View(ctx, 0, halfDim, query.Stride(1), opts.numHeads, query.Stride(2), numPatches)
	qFirst = nn.RoPE(ctx, qFirst, posX, halfDim, opts.ropeTheta, 1.0, ropeOpts)

	kFirst := key.View(ctx, 0, halfDim, key.Stride(1), opts.numHeads, key.Stride(2), numPatches)
	kFirst = nn.RoPE(ctx, kFirst, posX, halfDim, opts.ropeTheta, 1.0, ropeOpts)

	// Second half (byte offset halfDim elements into dim 0) rotates with y.
	halfOffset := halfDim * query.Stride(0)
	qSecond := query.View(ctx, halfOffset, halfDim, query.Stride(1), opts.numHeads, query.Stride(2), numPatches)
	qSecond = nn.RoPE(ctx, qSecond, posY, halfDim, opts.ropeTheta, 1.0, ropeOpts)

	halfOffsetK := halfDim * key.Stride(0)
	kSecond := key.View(ctx, halfOffsetK, halfDim, key.Stride(1), opts.numHeads, key.Stride(2), numPatches)
	kSecond = nn.RoPE(ctx, kSecond, posY, halfDim, opts.ropeTheta, 1.0, ropeOpts)

	query = qFirst.Concat(ctx, qSecond, 0)
	key = kFirst.Concat(ctx, kSecond, 0)

	// Use flash attention for numerical stability (handles large attention scores
	// from unclamped RMSNorm weights, e.g. 26B has addOne weights up to 19.5).
	// NOTE(review): scale is 1.0 rather than 1/sqrt(headDim) — presumably the
	// Q/K RMSNorms absorb the scaling; confirm against the reference model.
	attention := nn.Attention(ctx, query, key, value, 1.0, nil)
	attention = attention.Reshape(ctx, opts.hiddenSize, attention.Dim(2), batchSize)

	return sa.Output.Forward(ctx, attention)
}
|
||||
|
||||
type VisionMLP struct {
|
||||
Gate *ClippableLinear `gguf:"ffn_gate"`
|
||||
Up *ClippableLinear `gguf:"ffn_up"`
|
||||
Down *ClippableLinear `gguf:"ffn_down"`
|
||||
}
|
||||
|
||||
func (mlp *VisionMLP) Forward(ctx ml.Context, hiddenState ml.Tensor) ml.Tensor {
|
||||
gate := mlp.Gate.Forward(ctx, hiddenState)
|
||||
up := mlp.Up.Forward(ctx, hiddenState)
|
||||
hiddenState = gate.QuickGELU(ctx, up)
|
||||
return mlp.Down.Forward(ctx, hiddenState)
|
||||
}
|
||||
|
||||
// VisionEncoderLayer is one transformer block of the vision encoder:
// pre/post-normed self-attention and MLP sublayers, each with a residual
// connection, plus an optional per-layer output scale.
type VisionEncoderLayer struct {
	AttentionNorm     *nn.RMSNorm `gguf:"ln1"`
	SelfAttention     *VisionSelfAttention
	PostAttentionNorm *nn.RMSNorm `gguf:"attn_post_norm"`

	FFNNorm     *nn.RMSNorm `gguf:"ln2"`
	MLP         *VisionMLP
	PostFFNNorm *nn.RMSNorm `gguf:"ffn_post_norm"`

	// Optional learned scale applied to the layer output (nil when absent).
	LayerOutputScale ml.Tensor `gguf:"out_scale.weight"`
}

// Forward runs one encoder block over the patch sequence.
func (e *VisionEncoderLayer) Forward(ctx ml.Context, hiddenState, posX, posY, attnMask ml.Tensor, opts *VisionModelOptions) ml.Tensor {
	residual := hiddenState

	// Pre-attention norm -> self attention -> post-attention norm
	hiddenState = e.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
	hiddenState = e.SelfAttention.Forward(ctx, hiddenState, posX, posY, attnMask, opts)
	hiddenState = e.PostAttentionNorm.Forward(ctx, hiddenState, opts.eps)

	// Residual connection
	hiddenState = hiddenState.Add(ctx, residual)
	residual = hiddenState

	// Pre-FFN norm -> FFN -> post-FFN norm
	hiddenState = e.FFNNorm.Forward(ctx, hiddenState, opts.eps)
	hiddenState = e.MLP.Forward(ctx, hiddenState)
	hiddenState = e.PostFFNNorm.Forward(ctx, hiddenState, opts.eps)

	// Residual connection
	hiddenState = hiddenState.Add(ctx, residual)

	// Per-layer output scale
	if e.LayerOutputScale != nil {
		hiddenState = hiddenState.Mul(ctx, e.LayerOutputScale)
	}

	return hiddenState
}
|
||||
|
||||
// VisionModelOptions carries hyperparameters shared by all vision layers,
// read from GGUF metadata in newVisionModel.
type VisionModelOptions struct {
	hiddenSize int     // embedding width of the vision tower
	numHeads   int     // attention heads per layer
	patchSize  int     // pixels per patch edge
	nMerge     int     // pooling factor applied after the encoder
	eps        float32 // RMSNorm epsilon
	ropeTheta  float32 // base frequency for the 2D RoPE
}

// VisionModel is the Gemma4 vision encoder: Conv2D patch embedding, learned
// 2D position embeddings, a stack of encoder layers, and the packed clamp
// data consumed by InitClamp.
type VisionModel struct {
	PatchEmbedding    *nn.Conv2D `gguf:"patch_embd"`
	PositionEmbedding ml.Tensor  `gguf:"position_embd.weight"`
	ClampData         ml.Tensor  `gguf:"clamp_data"`

	Layers []VisionEncoderLayer `gguf:"blk"`

	*VisionModelOptions
	// clampInitDone makes InitClamp idempotent (not goroutine-safe).
	clampInitDone bool
}
|
||||
|
||||
// Forward embeds pixelValues into patch features, adds 2D position
// embeddings, and runs the encoder stack. The result is the raw encoder
// output; pooling and projection happen in visionPoolAndProject.
func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor, numPatchesX, numPatchesY int) ml.Tensor {
	numPatches := numPatchesX * numPatchesY

	// Patch embedding via Conv2D
	hiddenState := m.PatchEmbedding.Forward(ctx, pixelValues, m.patchSize, m.patchSize, 0, 0, 1, 1)
	hiddenState = hiddenState.Reshape(ctx, numPatches, m.hiddenSize)
	hiddenState = hiddenState.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx)

	// Conv2D with F16 weights produces F16 output via im2col; cast to F32 for encoder precision
	hiddenState = hiddenState.Cast(ctx, ml.DTypeF32)

	// 2D positional embeddings from 3D tensor [nEmbd, maxPos, 2]:
	// slice the tensor into an x table and a y table along the last dim.
	posSize := m.PositionEmbedding.Dim(1)
	nb1 := m.PositionEmbedding.Stride(1)
	tblX := m.PositionEmbedding.View(ctx, 0, m.hiddenSize, nb1, posSize)
	tblY := m.PositionEmbedding.View(ctx, posSize*nb1, m.hiddenSize, nb1, posSize)

	// Position indices for patches (row-major over the patch grid).
	posXData := make([]int32, numPatches)
	posYData := make([]int32, numPatches)
	for i := range numPatches {
		posXData[i] = int32(i % numPatchesX)
		posYData[i] = int32(i / numPatchesX)
	}

	posXEmb := ctx.Input().FromInts(posXData, numPatches)
	posYEmb := ctx.Input().FromInts(posYData, numPatches)

	hiddenState = hiddenState.Add(ctx, tblX.Rows(ctx, posXEmb))
	hiddenState = hiddenState.Add(ctx, tblY.Rows(ctx, posYEmb))

	// No attention mask — all positions are real patches
	var attnMask ml.Tensor

	// RoPE positions.
	// NOTE(review): these duplicate posXEmb/posYEmb (same data uploaded
	// twice) — presumably deliberate to keep the Rows and RoPE inputs as
	// distinct graph nodes; confirm whether the tensors could be reused.
	posXRope := ctx.Input().FromInts(posXData, numPatches)
	posYRope := ctx.Input().FromInts(posYData, numPatches)

	// Vision transformer layers
	for i := range m.Layers {
		hiddenState = m.Layers[i].Forward(ctx, hiddenState, posXRope, posYRope, attnMask, m.VisionModelOptions)
	}

	return hiddenState
}
|
||||
|
||||
func newVisionModel(c fs.Config) *VisionModel {
|
||||
return &VisionModel{
|
||||
Layers: make([]VisionEncoderLayer, c.Uint("vision.block_count")),
|
||||
VisionModelOptions: &VisionModelOptions{
|
||||
hiddenSize: int(c.Uint("vision.embedding_length")),
|
||||
numHeads: int(c.Uint("vision.attention.head_count")),
|
||||
patchSize: int(c.Uint("vision.patch_size", 16)),
|
||||
nMerge: int(c.Uint("vision.projector.scale_factor", 3)),
|
||||
eps: c.Float("vision.attention.layer_norm_epsilon", 1e-6),
|
||||
ropeTheta: 100.0,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
// visionTokenCount returns the number of image tokens an image of the given
// size yields after patching and pooling: each axis is divided by patchSize
// (patches) and then by nMerge (pooling), with integer truncation.
func visionTokenCount(imageWidth, imageHeight, patchSize, nMerge int) int {
	tokensAlong := func(pixels int) int {
		return pixels / patchSize / nMerge
	}
	return tokensAlong(imageWidth) * tokensAlong(imageHeight)
}
|
||||
|
||||
// visionPoolAndProject downsamples the encoder output with average pooling
// (merge factor opts.nMerge per axis) and projects it into the text
// embedding space via the multi-modal projector.
func visionPoolAndProject(ctx ml.Context, hiddenState ml.Tensor, numPatchesX, numPatchesY int, opts *VisionModelOptions, proj *MultiModalProjector) ml.Tensor {
	hiddenSize := opts.hiddenSize

	// Reshape from [hiddenSize, numPatches] to spatial layout for pooling
	hiddenState = hiddenState.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx)
	hiddenState = hiddenState.Reshape(ctx, numPatchesX, numPatchesY, hiddenSize)

	// AvgPool2D with kernel=stride=nMerge
	hiddenState = hiddenState.AvgPool2D(ctx, opts.nMerge, opts.nMerge, 0)

	// Reshape back to [hiddenSize, numMergedPatches]
	mergedX := numPatchesX / opts.nMerge
	mergedY := numPatchesY / opts.nMerge
	hiddenState = hiddenState.Reshape(ctx, mergedX*mergedY, hiddenSize)
	hiddenState = hiddenState.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx)

	// Ensure F32 for the projection Mulmat. The Metal mul_mm kernel for F16×F32
	// casts F32 activations to F16 in shared memory, so values must stay within
	// F16 range (≤65504). The sqrt(hiddenSize) scaling from the HF reference is
	// omitted because it's normalized out by the unweighted RMSNorm that follows
	// the projection: RMSNorm(√d·x) = x/rms(x) = RMSNorm(x).
	hiddenState = hiddenState.Cast(ctx, ml.DTypeF32)

	// Project to text embedding dimension
	hiddenState = proj.Forward(ctx, hiddenState, opts.eps)

	return hiddenState
}
|
||||
103
model/models/gemma4/process_image.go
Normal file
103
model/models/gemma4/process_image.go
Normal file
|
|
@ -0,0 +1,103 @@
|
|||
package gemma4
|
||||
|
||||
import (
|
||||
"image"
|
||||
"math"
|
||||
|
||||
"golang.org/x/image/draw"
|
||||
|
||||
"github.com/ollama/ollama/fs"
|
||||
)
|
||||
|
||||
// ImageProcessor prepares raw images for the vision encoder: aspect-ratio
// preserving resize aligned to patch boundaries, plus [-1, 1] normalization.
type ImageProcessor struct {
	patchSize   int // pixels per patch edge
	numChannels int // color channels expected by the encoder
	nMerge      int // pooling factor; resize aligns to patchSize*nMerge
	minPixels   int // lower pixel budget derived from the min token count
	maxPixels   int // upper pixel budget derived from the max token count
}
|
||||
|
||||
func newImageProcessor(c fs.Config) ImageProcessor {
|
||||
patchSize := int(c.Uint("vision.patch_size", 16))
|
||||
nMerge := int(c.Uint("vision.projector.scale_factor", 3))
|
||||
numChannels := int(c.Uint("vision.num_channels", 3))
|
||||
|
||||
// Token limits from reference: min=40, max=280 output tokens after pooling.
|
||||
// Convert to pixel counts: tokens * nMerge^2 * patchSize^2
|
||||
minTokens := 40
|
||||
maxTokens := 280
|
||||
patchArea := patchSize * patchSize * nMerge * nMerge
|
||||
minPixels := minTokens * patchArea
|
||||
maxPixels := maxTokens * patchArea
|
||||
|
||||
return ImageProcessor{
|
||||
patchSize: patchSize,
|
||||
numChannels: numChannels,
|
||||
nMerge: nMerge,
|
||||
minPixels: minPixels,
|
||||
maxPixels: maxPixels,
|
||||
}
|
||||
}
|
||||
|
||||
// ProcessImage resizes an image preserving aspect ratio, aligning dimensions
// to (patchSize * nMerge) boundaries, and normalizes pixels to [-1, 1].
// Returns the float32 pixel data (channel-first R,G,B planes) and the actual
// output dimensions. The error result is currently always nil; it is kept
// for interface stability.
func (p *ImageProcessor) ProcessImage(img image.Image) ([]float32, int, int, error) {
	// Compute target size preserving aspect ratio
	alignSize := p.patchSize * p.nMerge
	targetW, targetH := p.smartResize(img.Bounds().Dx(), img.Bounds().Dy(), alignSize)

	// Resize directly without alpha compositing, matching MLX reference.
	// NOTE(review): draw.Over composites onto a transparent RGBA canvas;
	// draw.Src would be the usual choice for a plain resize — confirm the
	// Over/Src distinction is intentional for images with alpha.
	dst := image.NewRGBA(image.Rect(0, 0, targetW, targetH))
	draw.BiLinear.Scale(dst, dst.Bounds(), img, img.Bounds(), draw.Over, nil)

	// Normalize to [-1, 1] using mean=0.5, std=0.5: (pixel/255 - 0.5) / 0.5 = 2*pixel/255 - 1
	data := p.pack(dst)
	return data, targetW, targetH, nil
}
|
||||
|
||||
// smartResize computes target dimensions that preserve aspect ratio and
|
||||
// align to alignSize boundaries. It scales the image to fill the maximum
|
||||
// patch budget (maxPixels), matching the MLX reference.
|
||||
func (p *ImageProcessor) smartResize(origW, origH, alignSize int) (int, int) {
|
||||
totalPx := origW * origH
|
||||
|
||||
var targetW, targetH int
|
||||
if p.maxPixels > 0 && totalPx > 0 {
|
||||
factor := math.Sqrt(float64(p.maxPixels) / float64(totalPx))
|
||||
targetH = max(alignSize, int(math.Floor(factor*float64(origH)/float64(alignSize)))*alignSize)
|
||||
targetW = max(alignSize, int(math.Floor(factor*float64(origW)/float64(alignSize)))*alignSize)
|
||||
} else {
|
||||
targetH = max(alignSize, (origH/alignSize)*alignSize)
|
||||
targetW = max(alignSize, (origW/alignSize)*alignSize)
|
||||
}
|
||||
|
||||
return targetW, targetH
|
||||
}
|
||||
|
||||
// pack extracts RGB values from an image and normalizes to [-1, 1].
|
||||
// Returns channel-first layout: [R..., G..., B...].
|
||||
func (p *ImageProcessor) pack(img image.Image) []float32 {
|
||||
bounds := img.Bounds()
|
||||
w := bounds.Dx()
|
||||
h := bounds.Dy()
|
||||
size := w * h
|
||||
|
||||
pixelVals := make([]float32, 3*size)
|
||||
rOff, gOff, bOff := 0, size, 2*size
|
||||
|
||||
for y := bounds.Min.Y; y < bounds.Max.Y; y++ {
|
||||
for x := bounds.Min.X; x < bounds.Max.X; x++ {
|
||||
c := img.At(x, y)
|
||||
r, g, b, _ := c.RGBA()
|
||||
idx := (y-bounds.Min.Y)*w + (x - bounds.Min.X)
|
||||
|
||||
// Normalize [0, 255] -> [-1, 1]: 2 * (val/255) - 1
|
||||
pixelVals[rOff+idx] = float32(r>>8)/255.0*2.0 - 1.0
|
||||
pixelVals[gOff+idx] = float32(g>>8)/255.0*2.0 - 1.0
|
||||
pixelVals[bOff+idx] = float32(b>>8)/255.0*2.0 - 1.0
|
||||
}
|
||||
}
|
||||
|
||||
return pixelVals
|
||||
}
|
||||
102
model/models/gemma4/tokenizer_compare_test.go
Normal file
102
model/models/gemma4/tokenizer_compare_test.go
Normal file
|
|
@ -0,0 +1,102 @@
|
|||
package gemma4
|
||||
|
||||
import (
|
||||
"os"
|
||||
"testing"
|
||||
|
||||
"github.com/ollama/ollama/ml"
|
||||
"github.com/ollama/ollama/model"
|
||||
"github.com/ollama/ollama/tokenizer"
|
||||
)
|
||||
|
||||
// TestTokenizerMatchesHF compares our tokenizer output against HuggingFace reference tokens.
|
||||
func TestTokenizerMatchesHF(t *testing.T) {
|
||||
modelPath := os.Getenv("GEMMA4_MODEL_PATH")
|
||||
if modelPath == "" {
|
||||
t.Skip("set GEMMA4_MODEL_PATH to a gemma4 GGUF file")
|
||||
}
|
||||
|
||||
m, err := model.New(modelPath, ml.BackendParams{AllocMemory: true})
|
||||
if err != nil {
|
||||
t.Fatalf("Failed to load model: %v", err)
|
||||
}
|
||||
defer m.Backend().Close()
|
||||
|
||||
tok := m.(tokenizer.Tokenizer)
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
input string
|
||||
expected []int32
|
||||
}{
|
||||
{
|
||||
name: "simple",
|
||||
input: "Hello, world!",
|
||||
expected: []int32{9259, 236764, 1902, 236888},
|
||||
},
|
||||
{
|
||||
name: "special_tokens",
|
||||
input: "<|turn>user\nWhat is 2+2?<turn|>\n<|turn>model\n",
|
||||
expected: []int32{105, 2364, 107, 3689, 563, 236743, 236778, 236862, 236778, 236881, 106, 107, 105, 4368, 107},
|
||||
},
|
||||
{
|
||||
name: "tool_declaration",
|
||||
input: "<|tool>declaration:bash{description:<|\"|>Run a command<|\"|>}<tool|>",
|
||||
expected: []int32{46, 163688, 236787, 42422, 236782, 7777, 236787, 52, 7306, 496, 4991, 52, 236783, 47},
|
||||
},
|
||||
{
|
||||
name: "tool_call",
|
||||
input: "<|tool_call>call:bash{command:<|\"|>ls -la<|\"|>}<tool_call|>",
|
||||
expected: []int32{48, 6639, 236787, 42422, 236782, 7674, 236787, 52, 5629, 753, 2149, 52, 236783, 49},
|
||||
},
|
||||
{
|
||||
name: "thinking",
|
||||
input: "<|channel>thought\nLet me think about this...<channel|>The answer is 42.",
|
||||
expected: []int32{100, 45518, 107, 6481, 786, 1751, 1003, 672, 1390, 101, 818, 3890, 563, 236743, 236812, 236778, 236761},
|
||||
},
|
||||
{
|
||||
name: "code",
|
||||
input: "func main() { fmt.Println(\"hello\") }",
|
||||
expected: []int32{6823, 1689, 825, 642, 22766, 236761, 29006, 885, 23391, 1373, 682},
|
||||
},
|
||||
{
|
||||
name: "numbers",
|
||||
input: "The answer is 42, not 43.5 or -1",
|
||||
expected: []int32{818, 3890, 563, 236743, 236812, 236778, 236764, 711, 236743, 236812, 236800, 236761, 236810, 653, 753, 236770},
|
||||
},
|
||||
{
|
||||
name: "mixed_chat_with_tools",
|
||||
input: "<|turn>system\nYou are a helpful assistant.\n<|tool>declaration:get_weather{description:<|\"|>Get weather<|\"|>,parameters:{properties:{city:{type:<|\"|>STRING<|\"|>}},type:<|\"|>OBJECT<|\"|>}}<tool|><turn|>\n<|turn>user\nWhat's the weather in Paris?<turn|>\n<|turn>model\n<|channel>thought\n<channel|>",
|
||||
expected: []int32{105, 9731, 107, 3048, 659, 496, 11045, 16326, 236761, 107, 46, 163688, 236787, 828, 236779, 19323, 236782, 7777, 236787, 52, 3407, 7606, 52, 236764, 19031, 29616, 15921, 29616, 13319, 29616, 2084, 236787, 52, 35410, 52, 5237, 2084, 236787, 52, 60688, 52, 1807, 47, 106, 107, 105, 2364, 107, 3689, 236789, 236751, 506, 7606, 528, 9079, 236881, 106, 107, 105, 4368, 107, 100, 45518, 107, 101},
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
tokens, err := tok.Encode(tt.input, false) // no BOS
|
||||
if err != nil {
|
||||
t.Fatalf("encode error: %v", err)
|
||||
}
|
||||
|
||||
if len(tokens) != len(tt.expected) {
|
||||
t.Errorf("token count mismatch: got %d, want %d", len(tokens), len(tt.expected))
|
||||
t.Logf("got: %v", tokens)
|
||||
t.Logf("want: %v", tt.expected)
|
||||
return
|
||||
}
|
||||
|
||||
mismatches := 0
|
||||
for i := range tokens {
|
||||
if tokens[i] != tt.expected[i] {
|
||||
mismatches++
|
||||
if mismatches <= 5 {
|
||||
t.Errorf("mismatch at [%d]: got %d, want %d", i, tokens[i], tt.expected[i])
|
||||
}
|
||||
}
|
||||
}
|
||||
if mismatches > 5 {
|
||||
t.Errorf("... and %d more mismatches", mismatches-5)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
|
@ -7,6 +7,7 @@ import (
|
|||
_ "github.com/ollama/ollama/model/models/gemma2"
|
||||
_ "github.com/ollama/ollama/model/models/gemma3"
|
||||
_ "github.com/ollama/ollama/model/models/gemma3n"
|
||||
_ "github.com/ollama/ollama/model/models/gemma4"
|
||||
_ "github.com/ollama/ollama/model/models/glm4moelite"
|
||||
_ "github.com/ollama/ollama/model/models/glmocr"
|
||||
_ "github.com/ollama/ollama/model/models/gptoss"
|
||||
|
|
|
|||
353
model/renderers/gemma4.go
Normal file
353
model/renderers/gemma4.go
Normal file
|
|
@ -0,0 +1,353 @@
|
|||
package renderers
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"sort"
|
||||
"strings"
|
||||
|
||||
"github.com/ollama/ollama/api"
|
||||
)
|
||||
|
||||
// Gemma4Renderer renders prompts using Gemma 4's chat format with
// <|turn>/<turn|> markers, <|"|> string delimiters, and <|tool>/
// <|tool_call>/<|tool_response> tags for function calling.
type Gemma4Renderer struct {
	// useImgTags controls whether [img-N] placeholders are emitted for
	// message images in renderContent.
	useImgTags bool
}

const (
	g4Q = `<|"|>` // Gemma 4 string delimiter
)
|
||||
|
||||
// Render builds the full Gemma 4 prompt from the conversation: an optional
// system turn (system message, tool declarations, thinking marker), the
// message turns, and a trailing generation prompt when the conversation does
// not already end in an open model turn.
func (r *Gemma4Renderer) Render(messages []api.Message, tools []api.Tool, thinkValue *api.ThinkValue) (string, error) {
	var sb strings.Builder
	imageOffset := 0

	// BOS token — Gemma 4 models have add_bos_token=false in their tokenizer
	// config, so the tokenizer does not auto-prepend BOS. We must emit it
	// explicitly in the rendered prompt, matching the HF chat template.
	sb.WriteString("<bos>")
	// Extract system message if present.
	var systemMessage string
	var loopMessages []api.Message
	if len(messages) > 0 && (messages[0].Role == "system" || messages[0].Role == "developer") {
		systemMessage = messages[0].Content
		loopMessages = messages[1:]
	} else {
		loopMessages = messages
	}

	// Emit system turn if there's a system message, tools, or thinking.
	hasThink := thinkValue != nil && thinkValue.Bool()
	if systemMessage != "" || len(tools) > 0 || hasThink {
		sb.WriteString("<|turn>system\n")
		if hasThink {
			sb.WriteString("<|think|>")
		}
		if systemMessage != "" {
			sb.WriteString(systemMessage)
		}
		for _, tool := range tools {
			sb.WriteString(r.renderToolDeclaration(tool))
		}
		sb.WriteString("<turn|>\n")
	}

	// inModelTurn tracks whether we're inside an open <|turn>model block.
	// Tool responses are appended inline (no separate turn), and the model
	// turn is only closed when we see a non-tool message or reach the end.
	inModelTurn := false

	for i, message := range loopMessages {
		switch message.Role {
		case "user":
			if inModelTurn {
				// Check if the preceding content was a tool response (no <turn|>
				// between tool response and next user turn per HF reference).
				prevIsToolResponse := i > 0 && loopMessages[i-1].Role == "tool"
				if !prevIsToolResponse {
					sb.WriteString("<turn|>\n")
				}
				inModelTurn = false
			}
			sb.WriteString("<|turn>user\n")
			r.renderContent(&sb, message, &imageOffset)
			sb.WriteString("<turn|>\n")

		case "assistant":
			// Close any previous model turn before opening a new one.
			if inModelTurn {
				sb.WriteString("<turn|>\n")
			}
			sb.WriteString("<|turn>model\n")
			inModelTurn = true
			if message.Content != "" {
				sb.WriteString(message.Content)
			}
			for _, tc := range message.ToolCalls {
				sb.WriteString(r.formatToolCall(tc))
			}

		case "tool":
			// Tool responses are rendered inline in the preceding model turn,
			// matching the reference format from HuggingFace's chat template.
			// Format: <|tool_response>response:NAME{key:value,...}<tool_response|>
			toolName := r.findToolName(loopMessages, i)
			sb.WriteString("<|tool_response>response:" + toolName + "{")
			r.renderToolResponseContent(&sb, message.Content)
			sb.WriteString("}<tool_response|>")
			// Keep the model turn open — it will be closed when we see the
			// next non-tool message or the assistant adds content after the response.

		default:
			// Unknown roles are rendered as plain turns with their role name.
			if inModelTurn {
				sb.WriteString("<turn|>\n")
				inModelTurn = false
			}
			sb.WriteString("<|turn>" + message.Role + "\n")
			sb.WriteString(message.Content)
			sb.WriteString("<turn|>\n")
		}
	}

	// If the last message is not an open assistant turn, add the generation prompt.
	if !inModelTurn {
		sb.WriteString("<|turn>model\n")
	}

	return sb.String(), nil
}
|
||||
|
||||
// renderContent writes a message's content, interleaving [img-N] tags for images.
|
||||
func (r *Gemma4Renderer) renderContent(sb *strings.Builder, msg api.Message, imageOffset *int) {
|
||||
if len(msg.Images) > 0 && r.useImgTags {
|
||||
for range msg.Images {
|
||||
sb.WriteString(fmt.Sprintf("[img-%d]", *imageOffset))
|
||||
*imageOffset++
|
||||
}
|
||||
}
|
||||
sb.WriteString(msg.Content)
|
||||
}
|
||||
|
||||
// renderToolDeclaration renders one tool definition in Gemma 4's structured
// format: <|tool>declaration:NAME{description:...,parameters:{...}}<tool|>.
// Fields are emitted in the fixed order description, properties, required,
// type, with commas inserted only between present fields.
func (r *Gemma4Renderer) renderToolDeclaration(tool api.Tool) string {
	var sb strings.Builder
	fn := tool.Function

	sb.WriteString("<|tool>declaration:" + fn.Name + "{")
	sb.WriteString("description:" + g4Q + fn.Description + g4Q)

	if fn.Parameters.Properties != nil || fn.Parameters.Type != "" {
		sb.WriteString(",parameters:{")

		// needsComma tracks whether an earlier field was written so each
		// subsequent field knows whether to prepend a separator.
		needsComma := false

		if fn.Parameters.Properties != nil && fn.Parameters.Properties.Len() > 0 {
			sb.WriteString("properties:{")
			r.writeProperties(&sb, fn.Parameters.Properties)
			sb.WriteString("}")
			needsComma = true
		}

		if len(fn.Parameters.Required) > 0 {
			if needsComma {
				sb.WriteString(",")
			}
			sb.WriteString("required:[")
			for i, req := range fn.Parameters.Required {
				if i > 0 {
					sb.WriteString(",")
				}
				sb.WriteString(g4Q + req + g4Q)
			}
			sb.WriteString("]")
			needsComma = true
		}

		if fn.Parameters.Type != "" {
			if needsComma {
				sb.WriteString(",")
			}
			// Types are upper-cased per the reference format (e.g. OBJECT).
			sb.WriteString("type:" + g4Q + strings.ToUpper(fn.Parameters.Type) + g4Q)
		}

		sb.WriteString("}")
	}

	sb.WriteString("}<tool|>")
	return sb.String()
}
|
||||
|
||||
// writeProperties renders a tool's parameter properties in sorted key order,
// each as name:{description:...,enum:[...],type:...} with only the present
// sub-fields and comma separators between them.
func (r *Gemma4Renderer) writeProperties(sb *strings.Builder, props *api.ToolPropertiesMap) {
	// Sort keys for deterministic output regardless of map iteration order.
	keys := make([]string, 0, props.Len())
	for k := range props.All() {
		keys = append(keys, k)
	}
	sort.Strings(keys)

	first := true
	for _, name := range keys {
		prop, _ := props.Get(name)
		if !first {
			sb.WriteString(",")
		}
		first = false

		sb.WriteString(name + ":{")
		if prop.Description != "" {
			sb.WriteString("description:" + g4Q + prop.Description + g4Q)
		}
		if len(prop.Enum) > 0 {
			if prop.Description != "" {
				sb.WriteString(",")
			}
			sb.WriteString("enum:[")
			for j, e := range prop.Enum {
				if j > 0 {
					sb.WriteString(",")
				}
				sb.WriteString(g4Q + fmt.Sprintf("%v", e) + g4Q)
			}
			sb.WriteString("]")
		}
		if len(prop.Type) > 0 {
			if prop.Description != "" || len(prop.Enum) > 0 {
				sb.WriteString(",")
			}
			// Only the first declared type is emitted, upper-cased.
			sb.WriteString("type:" + g4Q + strings.ToUpper(prop.Type[0]) + g4Q)
		}
		sb.WriteString("}")
	}
}
|
||||
|
||||
// formatToolCall renders an assistant tool call as
// <|tool_call>call:NAME{key:value,...}<tool_call|>, with arguments in
// sorted key order for deterministic output.
func (r *Gemma4Renderer) formatToolCall(tc api.ToolCall) string {
	var sb strings.Builder
	sb.WriteString("<|tool_call>call:" + tc.Function.Name + "{")

	keys := make([]string, 0, tc.Function.Arguments.Len())
	for k := range tc.Function.Arguments.All() {
		keys = append(keys, k)
	}
	sort.Strings(keys)

	first := true
	for _, key := range keys {
		value, _ := tc.Function.Arguments.Get(key)
		if !first {
			sb.WriteString(",")
		}
		first = false
		sb.WriteString(key + ":" + r.formatArgValue(value))
	}

	sb.WriteString("}<tool_call|>")
	return sb.String()
}
|
||||
|
||||
func (r *Gemma4Renderer) formatArgValue(value any) string {
|
||||
switch v := value.(type) {
|
||||
case string:
|
||||
return g4Q + v + g4Q
|
||||
case bool:
|
||||
if v {
|
||||
return "true"
|
||||
}
|
||||
return "false"
|
||||
case float64:
|
||||
if v == float64(int64(v)) {
|
||||
return fmt.Sprintf("%d", int64(v))
|
||||
}
|
||||
return fmt.Sprintf("%v", v)
|
||||
case int, int64, int32:
|
||||
return fmt.Sprintf("%d", v)
|
||||
case map[string]any:
|
||||
return r.formatMapValue(v)
|
||||
case []any:
|
||||
return r.formatArrayValue(v)
|
||||
default:
|
||||
return fmt.Sprintf("%v", v)
|
||||
}
|
||||
}
|
||||
|
||||
func (r *Gemma4Renderer) formatMapValue(m map[string]any) string {
|
||||
var sb strings.Builder
|
||||
sb.WriteString("{")
|
||||
|
||||
keys := make([]string, 0, len(m))
|
||||
for k := range m {
|
||||
keys = append(keys, k)
|
||||
}
|
||||
sort.Strings(keys)
|
||||
|
||||
first := true
|
||||
for _, key := range keys {
|
||||
if !first {
|
||||
sb.WriteString(",")
|
||||
}
|
||||
first = false
|
||||
sb.WriteString(key + ":" + r.formatArgValue(m[key]))
|
||||
}
|
||||
|
||||
sb.WriteString("}")
|
||||
return sb.String()
|
||||
}
|
||||
|
||||
func (r *Gemma4Renderer) formatArrayValue(arr []any) string {
|
||||
var sb strings.Builder
|
||||
sb.WriteString("[")
|
||||
for i, item := range arr {
|
||||
if i > 0 {
|
||||
sb.WriteString(",")
|
||||
}
|
||||
sb.WriteString(r.formatArgValue(item))
|
||||
}
|
||||
sb.WriteString("]")
|
||||
return sb.String()
|
||||
}
|
||||
|
||||
// renderToolResponseContent renders tool response content in Gemma 4 format.
|
||||
// If the content is valid JSON, it renders each field as key:value pairs with
|
||||
// proper type formatting (strings get <|"|> delimiters, numbers/bools are bare).
|
||||
// If not valid JSON, wraps the entire content as a single "value" string.
|
||||
func (r *Gemma4Renderer) renderToolResponseContent(sb *strings.Builder, content string) {
|
||||
// Try to parse as JSON object.
|
||||
var obj map[string]any
|
||||
if err := json.Unmarshal([]byte(content), &obj); err == nil {
|
||||
keys := make([]string, 0, len(obj))
|
||||
for k := range obj {
|
||||
keys = append(keys, k)
|
||||
}
|
||||
sort.Strings(keys)
|
||||
|
||||
first := true
|
||||
for _, key := range keys {
|
||||
if !first {
|
||||
sb.WriteString(",")
|
||||
}
|
||||
first = false
|
||||
sb.WriteString(key + ":" + r.formatArgValue(obj[key]))
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// Not JSON — wrap as a single string value.
|
||||
sb.WriteString("value:" + g4Q + content + g4Q)
|
||||
}
|
||||
|
||||
// findToolName walks backwards from tool message index to find the matching tool call name.
|
||||
func (r *Gemma4Renderer) findToolName(messages []api.Message, toolIdx int) string {
|
||||
for j := toolIdx - 1; j >= 0; j-- {
|
||||
if messages[j].Role == "assistant" && len(messages[j].ToolCalls) > 0 {
|
||||
toolOffset := 0
|
||||
for k := j + 1; k < toolIdx; k++ {
|
||||
if messages[k].Role == "tool" {
|
||||
toolOffset++
|
||||
}
|
||||
}
|
||||
if toolOffset < len(messages[j].ToolCalls) {
|
||||
return messages[j].ToolCalls[toolOffset].Function.Name
|
||||
}
|
||||
break
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
|
@ -509,6 +509,24 @@ func createModel(r api.CreateRequest, name model.Name, baseLayers []*layerGGML,
|
|||
config.ModelType = cmp.Or(config.ModelType, format.HumanNumber(layer.GGML.KV().ParameterCount()))
|
||||
config.FileType = cmp.Or(config.FileType, layer.GGML.KV().FileType().String())
|
||||
config.ModelFamilies = append(config.ModelFamilies, layer.GGML.KV().Architecture())
|
||||
|
||||
// Auto-detect renderer, parser, and stop tokens from GGUF architecture.
|
||||
// TODO: abstract this into a registry/lookup table when multiple models
|
||||
// need architecture-based renderer/parser/stop defaults.
|
||||
if config.Renderer == "" || config.Parser == "" {
|
||||
arch := layer.GGML.KV().Architecture()
|
||||
switch arch {
|
||||
case "gemma4":
|
||||
config.Renderer = cmp.Or(config.Renderer, "gemma4")
|
||||
config.Parser = cmp.Or(config.Parser, "gemma4")
|
||||
if _, ok := r.Parameters["stop"]; !ok {
|
||||
if r.Parameters == nil {
|
||||
r.Parameters = make(map[string]any)
|
||||
}
|
||||
r.Parameters["stop"] = []string{"<turn|>"}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
layers = append(layers, layer.Layer)
|
||||
}
|
||||
|
|
|
|||
|
|
@ -153,7 +153,16 @@ func getTensorNewType(kv fsggml.KV, qs *quantizeState, newType fsggml.TensorType
|
|||
// MLA tensors need higher precision to avoid quality degradation
|
||||
newType = fsggml.TensorTypeQ8_0
|
||||
} else if strings.Contains(name, "ffn_down") {
|
||||
iLayer := qs.iFfnDown
|
||||
// For MoE models, ffn_down.weight (dense) and ffn_down_exps.weight (expert) both
|
||||
// exist per layer and should get the same useMoreBits treatment. Dense sorts before
|
||||
// expert alphabetically, so dense increments the counter and expert uses counter-1.
|
||||
var iLayer int
|
||||
if strings.Contains(name, "_exps") {
|
||||
iLayer = max(0, qs.iFfnDown-1)
|
||||
} else {
|
||||
iLayer = qs.iFfnDown
|
||||
qs.iFfnDown++
|
||||
}
|
||||
n_layer := qs.nFfnDown
|
||||
if ftype == fsggml.FileTypeQ4_K_M {
|
||||
if useMoreBits(iLayer, n_layer) {
|
||||
|
|
@ -162,7 +171,6 @@ func getTensorNewType(kv fsggml.KV, qs *quantizeState, newType fsggml.TensorType
|
|||
} else if ftype == fsggml.FileTypeQ4_K_S && iLayer < n_layer/8 {
|
||||
newType = fsggml.TensorTypeQ5_K
|
||||
}
|
||||
qs.iFfnDown++
|
||||
} else if strings.Contains(name, "attn_output.weight") {
|
||||
if nExperts == 8 {
|
||||
if ftype == fsggml.FileTypeQ4_K_S || ftype == fsggml.FileTypeQ4_K_M {
|
||||
|
|
|
|||
Loading…
Reference in a new issue