model/models/bert: support t5 (SentencePiece) tokenizer for bge-m3

bge-m3 (and any bert-architecture model with tokenizer.ggml.model="t5")
failed to load in the Go native engine with ErrUnsupportedTokenizer. Add
"t5" → SentencePiece routing to bert/embed.go, mirroring the same pattern
used in nomicbert/model.go. Also read tokenizer.ggml.add_space_prefix from
GGUF so bge-m3's add_space_prefix=true is honoured.

Fixes the underlying cause of bge-m3 producing NaN or failing to load in
the native engine (#15582).
This commit is contained in:
Michael Verrilli 2026-04-19 07:14:13 +00:00
parent 00bd3b894e
commit bc30f7fbff
No known key found for this signature in database
GPG key ID: E4F2103B6C63B961
2 changed files with 56 additions and 0 deletions

View file

@ -152,12 +152,15 @@ func New(c fs.Config) (model.Model, error) {
c.Uint("tokenizer.ggml.eos_token_id"),
)),
},
AddSpacePrefix: c.Bool("tokenizer.ggml.add_space_prefix", false),
}
var t tokenizer.Tokenizer
switch c.String("tokenizer.ggml.model", "bert") {
case "bert":
t = tokenizer.NewWordPiece(vocab, true)
case "t5":
t = tokenizer.NewSentencePiece(vocab)
default:
return nil, model.ErrUnsupportedTokenizer
}

View file

@ -0,0 +1,53 @@
package bert
import (
"testing"
"github.com/ollama/ollama/tokenizer"
)
// TestBertNewT5Tokenizer verifies that a bert model configured with
// tokenizer.ggml.model="t5" (as bge-m3 is) loads a SentencePiece tokenizer
// rather than returning ErrUnsupportedTokenizer.
//
// NOTE(review): this exercises tokenizer.NewSentencePiece directly with a
// hand-built vocabulary; it does not call bert.New, since constructing a
// full fs.Config here is impractical. The "t5" switch-case routing itself
// is therefore only covered indirectly — confirm with an integration test
// against a real bge-m3 GGUF.
func TestBertNewT5Tokenizer(t *testing.T) {
	vocab := &tokenizer.Vocabulary{
		Values:         []string{"▁hello", "▁world", "▁test", "<s>", "</s>", "h", "e", "l", "o", "w", "r", "d"},
		Scores:         []float32{-1, -1, -1, 0, 0, -5, -5, -5, -5, -5, -5, -5},
		Types:          []int32{1, 1, 1, 4, 4, 1, 1, 1, 1, 1, 1, 1},
		BOS:            []int32{3},
		EOS:            []int32{4},
		AddBOS:         true,
		AddEOS:         true,
		AddSpacePrefix: true,
	}
	spm := tokenizer.NewSentencePiece(vocab)

	t.Run("encodes_without_error", func(t *testing.T) {
		ids, err := spm.Encode("hello world", true)
		if err != nil {
			t.Fatalf("Encode: %v", err)
		}
		// With add_space_prefix=true and BOS/EOS enabled the expected
		// sequence is [<s>, ▁hello, ▁world, </s>] = [3, 0, 1, 4].
		// Previously this subtest only logged the ids; assert the length
		// and the special tokens so a routing regression actually fails.
		if len(ids) != 4 {
			t.Fatalf("got %d tokens (%v), want 4", len(ids), ids)
		}
		if ids[0] != 3 || ids[len(ids)-1] != 4 {
			t.Errorf("got %v, want BOS=3 first and EOS=4 last", ids)
		}
	})

	t.Run("add_space_prefix_prepends_whitespace_token", func(t *testing.T) {
		// "hello" with add_space_prefix=true should produce the ▁hello
		// token (id=0) rather than the raw character tokens.
		ids, err := spm.Encode("hello", false)
		if err != nil {
			t.Fatal(err)
		}
		if len(ids) != 1 || ids[0] != 0 {
			t.Errorf("got %v, want [0] (▁hello)", ids)
		}
	})

	t.Run("is_sentence_piece_not_wordpiece", func(t *testing.T) {
		// The original subtest held only a compile-time interface
		// assertion (var _ tokenizer.Tokenizer = spm), which asserts
		// nothing at runtime and cannot distinguish SentencePiece from
		// WordPiece. Instead, check SentencePiece-specific behavior:
		// "world" must match the space-prefixed ▁world piece (id=1);
		// a WordPiece tokenizer with this vocab (no "##" continuation
		// pieces) could not produce that single-token encoding.
		ids, err := spm.Encode("world", false)
		if err != nil {
			t.Fatal(err)
		}
		if len(ids) != 1 || ids[0] != 1 {
			t.Errorf("got %v, want [1] (▁world)", ids)
		}
	})
}