package gemma4 import ( "math" "github.com/ollama/ollama/fs" "github.com/ollama/ollama/ml" "github.com/ollama/ollama/ml/nn" "github.com/ollama/ollama/ml/nn/rope" ) const batchSize = 1 // ClippableLinear is a linear layer with optional input/output clamping. // Required by Gemma4 vision encoder for numerical stability with F16 weights. type ClippableLinear struct { Weight ml.Tensor `gguf:"weight"` InputMin ml.Tensor `gguf:"input_min"` InputMax ml.Tensor `gguf:"input_max"` OutputMin ml.Tensor `gguf:"output_min"` OutputMax ml.Tensor `gguf:"output_max"` inMin, inMax, outMin, outMax float32 hasClamp bool clampsLoaded bool } func scalarValue(t ml.Tensor) (float32, bool) { if t == nil { return 0, false } data := t.BackendGet() if len(data) == 0 { return 0, false } return data[0], true } func (l *ClippableLinear) loadClampFromScalars() { if l.clampsLoaded { return } l.clampsLoaded = true const ( defaultMin = -math.MaxFloat32 defaultMax = math.MaxFloat32 ) inMin, hasInMin := scalarValue(l.InputMin) inMax, hasInMax := scalarValue(l.InputMax) outMin, hasOutMin := scalarValue(l.OutputMin) outMax, hasOutMax := scalarValue(l.OutputMax) if !(hasInMin || hasInMax || hasOutMin || hasOutMax) { return } l.hasClamp = true l.inMin = defaultMin l.inMax = defaultMax l.outMin = defaultMin l.outMax = defaultMax if hasInMin { l.inMin = inMin } if hasInMax { l.inMax = inMax } if hasOutMin { l.outMin = outMin } if hasOutMax { l.outMax = outMax } } func (l *ClippableLinear) Forward(ctx ml.Context, x ml.Tensor) ml.Tensor { if l.hasClamp { x = x.Clamp(ctx, l.inMin, l.inMax) } out := l.Weight.Mulmat(ctx, x) if l.hasClamp { out = out.Clamp(ctx, l.outMin, l.outMax) } return out } // InitClamp distributes packed clamp values from v.clamp_data to ClippableLinear structs. // If scalar clamp tensors (input_min/max, output_min/max) are present, they are used too. // Layout: numLayers × 7 linears (q,k,v,out,gate,up,down) × 4 floats (inMin,inMax,outMin,outMax) // then 4 floats for the projector. func (m *VisionModel) InitClamp(proj *MultiModalProjector) { if m.clampInitDone { return } m.clampInitDone = true linears := func(l *VisionEncoderLayer) []*ClippableLinear { return []*ClippableLinear{ l.SelfAttention.Query, l.SelfAttention.Key, l.SelfAttention.Value, l.SelfAttention.Output, l.MLP.Gate, l.MLP.Up, l.MLP.Down, } } for i := range m.Layers { for _, cl := range linears(&m.Layers[i]) { if cl != nil { cl.loadClampFromScalars() } } } if proj != nil && proj.Projection != nil { proj.Projection.loadClampFromScalars() } // Load packed clamp data when present (legacy Ollama format). if m.ClampData == nil { return } // Read all clamp values from packed F32 tensor data := m.ClampData.BackendGet() if len(data) == 0 { return } // Distribute to layer linears: 7 per layer × 4 values each for i := range m.Layers { for li, cl := range linears(&m.Layers[i]) { if cl == nil { continue } idx := (i*7 + li) * 4 if idx+3 < len(data) { cl.inMin = data[idx] cl.inMax = data[idx+1] cl.outMin = data[idx+2] cl.outMax = data[idx+3] cl.hasClamp = true } } } // Projector clamp values (last 4 floats) if proj != nil && proj.Projection != nil { projIdx := len(m.Layers) * 7 * 4 if projIdx+3 < len(data) { proj.Projection.inMin = data[projIdx] proj.Projection.inMax = data[projIdx+1] proj.Projection.outMin = data[projIdx+2] proj.Projection.outMax = data[projIdx+3] proj.Projection.hasClamp = true } } } type VisionSelfAttention struct { Query *ClippableLinear `gguf:"attn_q"` Key *ClippableLinear `gguf:"attn_k"` Value *ClippableLinear `gguf:"attn_v"` QueryNorm *nn.RMSNorm `gguf:"attn_q_norm"` KeyNorm *nn.RMSNorm `gguf:"attn_k_norm"` Output *ClippableLinear `gguf:"attn_out"` } func (sa *VisionSelfAttention) Forward(ctx ml.Context, hiddenState, posX, posY, attnMask ml.Tensor, opts *VisionModelOptions) ml.Tensor { numPatches := hiddenState.Dim(1) headDim := opts.hiddenSize / opts.numHeads query := sa.Query.Forward(ctx, hiddenState) key := sa.Key.Forward(ctx, hiddenState) value := sa.Value.Forward(ctx, hiddenState) query = query.Reshape(ctx, headDim, opts.numHeads, numPatches, batchSize) key = key.Reshape(ctx, headDim, opts.numHeads, numPatches, batchSize) value = value.Reshape(ctx, headDim, opts.numHeads, numPatches, batchSize) // Q/K norms (Gemma-style: x * (1 + weight) / rms(x)) query = sa.QueryNorm.Forward(ctx, query, opts.eps) key = sa.KeyNorm.Forward(ctx, key, opts.eps) // V norm (RMSNorm without learned weights) value = value.RMSNorm(ctx, nil, opts.eps) // 2D RoPE: split head dim in half, apply NeoX RoPE with x positions to first half, // y positions to second half, then concatenate. halfDim := headDim / 2 ropeOpts := rope.WithTypeNeoX() qFirst := query.View(ctx, 0, halfDim, query.Stride(1), opts.numHeads, query.Stride(2), numPatches) qFirst = nn.RoPE(ctx, qFirst, posX, halfDim, opts.ropeTheta, 1.0, ropeOpts) kFirst := key.View(ctx, 0, halfDim, key.Stride(1), opts.numHeads, key.Stride(2), numPatches) kFirst = nn.RoPE(ctx, kFirst, posX, halfDim, opts.ropeTheta, 1.0, ropeOpts) halfOffset := halfDim * query.Stride(0) qSecond := query.View(ctx, halfOffset, halfDim, query.Stride(1), opts.numHeads, query.Stride(2), numPatches) qSecond = nn.RoPE(ctx, qSecond, posY, halfDim, opts.ropeTheta, 1.0, ropeOpts) halfOffsetK := halfDim * key.Stride(0) kSecond := key.View(ctx, halfOffsetK, halfDim, key.Stride(1), opts.numHeads, key.Stride(2), numPatches) kSecond = nn.RoPE(ctx, kSecond, posY, halfDim, opts.ropeTheta, 1.0, ropeOpts) query = qFirst.Concat(ctx, qSecond, 0) key = kFirst.Concat(ctx, kSecond, 0) // Use flash attention for numerical stability (handles large attention scores // from unclamped RMSNorm weights, e.g. 26B has addOne weights up to 19.5) attention := nn.Attention(ctx, query, key, value, 1.0, nil) attention = attention.Reshape(ctx, opts.hiddenSize, attention.Dim(2), batchSize) return sa.Output.Forward(ctx, attention) } type VisionMLP struct { Gate *ClippableLinear `gguf:"ffn_gate"` Up *ClippableLinear `gguf:"ffn_up"` Down *ClippableLinear `gguf:"ffn_down"` } func (mlp *VisionMLP) Forward(ctx ml.Context, hiddenState ml.Tensor) ml.Tensor { gate := mlp.Gate.Forward(ctx, hiddenState) up := mlp.Up.Forward(ctx, hiddenState) hiddenState = gate.QuickGELU(ctx, up) return mlp.Down.Forward(ctx, hiddenState) } type VisionEncoderLayer struct { AttentionNorm *nn.RMSNorm `gguf:"ln1"` SelfAttention *VisionSelfAttention PostAttentionNorm *nn.RMSNorm `gguf:"attn_post_norm"` FFNNorm *nn.RMSNorm `gguf:"ln2"` MLP *VisionMLP PostFFNNorm *nn.RMSNorm `gguf:"ffn_post_norm"` LayerOutputScale ml.Tensor `gguf:"out_scale.weight"` } func (e *VisionEncoderLayer) Forward(ctx ml.Context, hiddenState, posX, posY, attnMask ml.Tensor, opts *VisionModelOptions) ml.Tensor { residual := hiddenState // Pre-attention norm -> self attention -> post-attention norm hiddenState = e.AttentionNorm.Forward(ctx, hiddenState, opts.eps) hiddenState = e.SelfAttention.Forward(ctx, hiddenState, posX, posY, attnMask, opts) hiddenState = e.PostAttentionNorm.Forward(ctx, hiddenState, opts.eps) // Residual connection hiddenState = hiddenState.Add(ctx, residual) residual = hiddenState // Pre-FFN norm -> FFN -> post-FFN norm hiddenState = e.FFNNorm.Forward(ctx, hiddenState, opts.eps) hiddenState = e.MLP.Forward(ctx, hiddenState) hiddenState = e.PostFFNNorm.Forward(ctx, hiddenState, opts.eps) // Residual connection hiddenState = hiddenState.Add(ctx, residual) // Per-layer output scale if e.LayerOutputScale != nil { hiddenState = hiddenState.Mul(ctx, e.LayerOutputScale) } return hiddenState } type VisionModelOptions struct { hiddenSize int numHeads int patchSize int nMerge int eps float32 ropeTheta float32 } type VisionModel struct { PatchEmbedding *nn.Conv2D `gguf:"patch_embd"` PositionEmbedding ml.Tensor `gguf:"position_embd.weight"` ClampData ml.Tensor `gguf:"clamp_data"` StdBias ml.Tensor `gguf:"std_bias"` StdScale ml.Tensor `gguf:"std_scale"` Layers []VisionEncoderLayer `gguf:"blk"` *VisionModelOptions clampInitDone bool } func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor, numPatchesX, numPatchesY int) ml.Tensor { numPatches := numPatchesX * numPatchesY // Patch embedding via Conv2D hiddenState := m.PatchEmbedding.Forward(ctx, pixelValues, m.patchSize, m.patchSize, 0, 0, 1, 1) hiddenState = hiddenState.Reshape(ctx, numPatches, m.hiddenSize) hiddenState = hiddenState.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx) // Conv2D with F16 weights produces F16 output via im2col; cast to F32 for encoder precision hiddenState = hiddenState.Cast(ctx, ml.DTypeF32) // 2D positional embeddings from 3D tensor [nEmbd, maxPos, 2] posSize := m.PositionEmbedding.Dim(1) nb1 := m.PositionEmbedding.Stride(1) tblX := m.PositionEmbedding.View(ctx, 0, m.hiddenSize, nb1, posSize) tblY := m.PositionEmbedding.View(ctx, posSize*nb1, m.hiddenSize, nb1, posSize) // Position indices for patches posXData := make([]int32, numPatches) posYData := make([]int32, numPatches) for i := range numPatches { posXData[i] = int32(i % numPatchesX) posYData[i] = int32(i / numPatchesX) } posXEmb := ctx.Input().FromInts(posXData, numPatches) posYEmb := ctx.Input().FromInts(posYData, numPatches) hiddenState = hiddenState.Add(ctx, tblX.Rows(ctx, posXEmb)) hiddenState = hiddenState.Add(ctx, tblY.Rows(ctx, posYEmb)) // No attention mask — all positions are real patches var attnMask ml.Tensor // RoPE positions posXRope := ctx.Input().FromInts(posXData, numPatches) posYRope := ctx.Input().FromInts(posYData, numPatches) // Vision transformer layers for i := range m.Layers { hiddenState = m.Layers[i].Forward(ctx, hiddenState, posXRope, posYRope, attnMask, m.VisionModelOptions) } return hiddenState } func newVisionModel(c fs.Config) *VisionModel { return &VisionModel{ Layers: make([]VisionEncoderLayer, c.Uint("vision.block_count")), VisionModelOptions: &VisionModelOptions{ hiddenSize: int(c.Uint("vision.embedding_length")), numHeads: int(c.Uint("vision.attention.head_count")), patchSize: int(c.Uint("vision.patch_size", 16)), nMerge: int(c.Uint("vision.projector.scale_factor", 3)), eps: c.Float("vision.attention.layer_norm_epsilon", 1e-6), ropeTheta: 100.0, }, } } func visionPoolAndProject(ctx ml.Context, hiddenState ml.Tensor, numPatchesX, numPatchesY int, opts *VisionModelOptions, proj *MultiModalProjector, stdBias, stdScale ml.Tensor) ml.Tensor { hiddenSize := opts.hiddenSize // Reshape from [hiddenSize, numPatches] to spatial layout for pooling hiddenState = hiddenState.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx) hiddenState = hiddenState.Reshape(ctx, numPatchesX, numPatchesY, hiddenSize) // AvgPool2D with kernel=stride=nMerge hiddenState = hiddenState.AvgPool2D(ctx, opts.nMerge, opts.nMerge, 0) // Reshape back to [hiddenSize, numMergedPatches] mergedX := numPatchesX / opts.nMerge mergedY := numPatchesY / opts.nMerge hiddenState = hiddenState.Reshape(ctx, mergedX*mergedY, hiddenSize) hiddenState = hiddenState.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx) hiddenState = hiddenState.Cast(ctx, ml.DTypeF32) hiddenState = hiddenState.Scale(ctx, math.Sqrt(float64(hiddenSize))) // Optional vision standardization before projection. if stdBias != nil && stdScale != nil { hiddenState = hiddenState.Sub(ctx, stdBias) hiddenState = hiddenState.Mul(ctx, stdScale) } // Project to text embedding dimension hiddenState = proj.Forward(ctx, hiddenState, opts.eps) return hiddenState }