This commit is contained in:
Daniel Hiltgen 2026-04-23 13:47:20 +10:00 committed by GitHub
commit 40c588f65c
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
8 changed files with 884 additions and 33 deletions

View file

@ -387,6 +387,10 @@ func ConvertModel(fsys fs.FS, f *os.File) error {
}
func writeFile(f *os.File, kv KV, ts []*ggml.Tensor) error {
for k, v := range sourceTensorKV(ts) {
kv[k] = v
}
for i := range ts {
ts[i].Shape = slices.Clone(ts[i].Shape)
slices.Reverse(ts[i].Shape)

View file

@ -5,10 +5,12 @@ import (
"bytes"
"encoding/binary"
"encoding/json"
"errors"
"fmt"
"io"
"io/fs"
"maps"
"math"
"slices"
"strings"
@ -23,6 +25,11 @@ type safetensorMetadata struct {
}
func parseSafetensors(fsys fs.FS, replacer *strings.Replacer, ps ...string) ([]Tensor, error) {
fp8Block, err := safetensorsFP8BlockSize(fsys)
if err != nil {
return nil, err
}
var ts []Tensor
for _, p := range ps {
f, err := fsys.Open(p)
@ -50,24 +57,47 @@ func parseSafetensors(fsys fs.FS, replacer *strings.Replacer, ps ...string) ([]T
names := make(map[string]struct{}, len(keys))
fp8Scales, err := collectSafetensorsFP8Scales(n, headers)
if err != nil {
return nil, err
}
for _, key := range keys {
if value := headers[key]; value.Type != "" {
if _, ok := fp8Scales.consumed[key]; ok {
continue
}
// Scalar tensors (e.g. clipped linear min/max) are 0-dim in safetensors.
// Promote them to 1-dim so they can be stored in GGUF.
if len(value.Shape) == 0 {
value.Shape = []uint64{1}
}
var scale *safetensorScale
if value.Type == "F8_E4M3" {
if !fp8Block.ok {
return nil, fmt.Errorf("missing fp8 block size metadata for tensor %q", key)
}
scale = fp8Scales.byWeight[key]
if scale == nil {
return nil, fmt.Errorf("missing fp8 scale companion for tensor %q", key)
}
}
ggufName := replacer.Replace(key)
if _, ok := names[ggufName]; ok {
return nil, fmt.Errorf("duplicate tensor name '%s' was found for this model", ggufName)
}
names[ggufName] = struct{}{}
ts = append(ts, safetensor{
fs: fsys,
path: p,
dtype: value.Type,
offset: safetensorsPad(n, value.Offsets[0]),
size: safetensorsPad(n, value.Offsets[1]) - safetensorsPad(n, value.Offsets[0]),
fs: fsys,
path: p,
dtype: value.Type,
offset: safetensorsPad(n, value.Offsets[0]),
size: safetensorsPad(n, value.Offsets[1]) - safetensorsPad(n, value.Offsets[0]),
scale: scale,
fp8Block: fp8Block,
tensorBase: &tensorBase{
name: ggufName,
shape: value.Shape,
@ -85,12 +115,22 @@ func safetensorsPad(n, offset int64) int64 {
return 8 + n + offset
}
type safetensor struct {
fs fs.FS
path string
type safetensorScale struct {
name string
dtype string
shape []uint64
offset int64
size int64
}
type safetensor struct {
fs fs.FS
path string
dtype string
offset int64
size int64
scale *safetensorScale
fp8Block safetensorFP8BlockSize
*tensorBase
}
@ -104,17 +144,26 @@ func (st safetensor) Kind() uint32 {
kind != tensorKindFP32 {
kind = tensorKindBF16
}
if st.dtype == "F8_E4M3" && kind != tensorKindFP32 {
kind = tensorKindBF16
}
return kind
}
func (st safetensor) SourceDType() string {
return st.dtype
}
func (st safetensor) Clone() Tensor {
return &safetensor{
fs: st.fs,
path: st.path,
dtype: st.dtype,
offset: st.offset,
size: st.size,
fs: st.fs,
path: st.path,
dtype: st.dtype,
offset: st.offset,
size: st.size,
scale: st.scale.Clone(),
fp8Block: st.fp8Block,
tensorBase: &tensorBase{
name: st.name,
repacker: st.repacker,
@ -123,6 +172,19 @@ func (st safetensor) Clone() Tensor {
}
}
// Clone returns a deep copy of ss so mutations of the copy's shape slice
// cannot alias the original. A nil receiver clones to nil.
func (ss *safetensorScale) Clone() *safetensorScale {
	if ss == nil {
		return nil
	}
	dup := *ss
	dup.shape = slices.Clone(ss.shape)
	return &dup
}
func (st safetensor) WriteTo(w io.Writer) (int64, error) {
f, err := st.fs.Open(st.path)
if err != nil {
@ -180,6 +242,16 @@ func (st safetensor) WriteTo(w io.Writer) (int64, error) {
}
f32s = bfloat16.DecodeFloat32(u8s)
case "F8_E4M3":
u8s := make([]uint8, st.size)
if err = binary.Read(br, binary.LittleEndian, u8s); err != nil {
return 0, err
}
f32s, err = st.decodeFP8E4M3(u8s)
if err != nil {
return 0, err
}
default:
return 0, fmt.Errorf("unknown data type: %s", st.dtype)
}
@ -208,3 +280,334 @@ func (st safetensor) WriteTo(w io.Writer) (int64, error) {
return 0, fmt.Errorf("unknown storage type: %d", st.Kind())
}
}
// safetensorsFP8Scales indexes the per-block scale tensors that accompany
// FP8 (F8_E4M3) weights in a safetensors header.
type safetensorsFP8Scales struct {
byWeight map[string]*safetensorScale // weight tensor name -> its scale companion
consumed map[string]struct{}         // scale tensor names claimed by some weight
}
// collectSafetensorsFP8Scales scans headers for F8_E4M3 weight tensors and
// pairs each with its scale companion tensor (e.g. "<name>_scale"). n is the
// byte length of the safetensors JSON header; safetensorsPad translates the
// header's relative offsets into absolute file offsets with it. The returned
// consumed set lets the caller skip emitting scale tensors as standalone
// tensors. A scale tensor claimed by more than one weight is an error; a
// weight with no companion simply gets no byWeight entry and is rejected
// later by the caller.
func collectSafetensorsFP8Scales(n int64, headers map[string]safetensorMetadata) (safetensorsFP8Scales, error) {
scales := safetensorsFP8Scales{
byWeight: make(map[string]*safetensorScale),
consumed: make(map[string]struct{}),
}
for key, value := range headers {
if value.Type != "F8_E4M3" {
continue
}
scaleKey, scaleValue, ok, err := safetensorsFP8Scale(key, headers)
if err != nil {
return safetensorsFP8Scales{}, err
}
if !ok {
// No companion found; the parse loop reports the error if needed.
continue
}
if _, ok := scales.consumed[scaleKey]; ok {
return safetensorsFP8Scales{}, fmt.Errorf("fp8 scale companion %q is used by multiple tensors", scaleKey)
}
scales.byWeight[key] = &safetensorScale{
name: scaleKey,
dtype: scaleValue.Type,
shape: slices.Clone(scaleValue.Shape),
offset: safetensorsPad(n, scaleValue.Offsets[0]),
size: safetensorsPad(n, scaleValue.Offsets[1]) - safetensorsPad(n, scaleValue.Offsets[0]),
}
scales.consumed[scaleKey] = struct{}{}
}
return scales, nil
}
// safetensorsFP8Scale finds the scale companion tensor for fp8 weight key in
// headers. It returns the companion's name and metadata, ok=false when no
// candidate exists, and an error when more than one distinct candidate
// matches. appendUnique keeps the candidate list duplicate-free: for keys
// ending in ".weight", base+".weight_scale" is the same string as the
// generic key+"_scale" form and must not be counted as an ambiguity.
func safetensorsFP8Scale(key string, headers map[string]safetensorMetadata) (string, safetensorMetadata, bool, error) {
candidates := safetensorsFP8ScaleCandidates(key)
var scaleKey string
var scaleValue safetensorMetadata
if strings.HasSuffix(key, ".weight") {
// Keep support for compressed-tensors exports that place the scale name
// between the module path and weight suffix.
base := strings.TrimSuffix(key, ".weight")
candidates = appendUnique(candidates, base+".weight_scale")
candidates = appendUnique(candidates, base+".weight_scale_inv")
}
for _, candidate := range candidates {
// Header entries with an empty Type are not tensor data; skip them.
if value, ok := headers[candidate]; ok && value.Type != "" {
if scaleKey != "" {
return "", safetensorMetadata{}, false, fmt.Errorf("multiple fp8 scale companions for tensor %q: %q and %q", key, scaleKey, candidate)
}
scaleKey = candidate
scaleValue = value
}
}
if scaleKey == "" {
return "", safetensorMetadata{}, false, nil
}
return scaleKey, scaleValue, true, nil
}
// safetensorsFP8ScaleCandidates returns the possible names of the scale
// companion tensor for an fp8 weight named key, in lookup order. The four
// suffix variants cover the exporters seen in the wild ("_scale",
// "_scale_inv", ".scale", ".scale_inv"); they are pairwise distinct for any
// key, so no dedup step is needed here (the caller dedups only when it
// appends extra ".weight"-based candidates).
func safetensorsFP8ScaleCandidates(key string) []string {
	return []string{
		key + "_scale",
		key + "_scale_inv",
		key + ".scale",
		key + ".scale_inv",
	}
}
// appendUnique appends value to values unless it is already present,
// returning the (possibly grown) slice.
func appendUnique(values []string, value string) []string {
	if slices.Contains(values, value) {
		return values
	}
	return append(values, value)
}
// safetensorFP8BlockSize is the block-quantization granularity for fp8
// weights; ok reports whether a block size was actually found in config.json.
type safetensorFP8BlockSize struct {
rows int
cols int
ok bool
}
// safetensorsSourceQuantization mirrors the quantization sections emitted by
// fp8 exports: quant_method/weight_block_size (DeepSeek-style) and
// format/config_groups (compressed-tensors style).
type safetensorsSourceQuantization struct {
QuantMethod string `json:"quant_method"`
Format string `json:"format"`
WeightBlockSize []int `json:"weight_block_size"`
ConfigGroups map[string]struct {
Format string `json:"format"`
Weights struct {
BlockStructure []int `json:"block_structure"`
NumBits int `json:"num_bits"`
Type string `json:"type"`
} `json:"weights"`
} `json:"config_groups"`
}
// safetensorsModelConfig captures the config.json locations where
// quantization metadata may appear: three key spellings at the top level
// and the same three nested under text_config.
type safetensorsModelConfig struct {
Quantization safetensorsSourceQuantization `json:"quantization"`
QuantizationConfig safetensorsSourceQuantization `json:"quantization_config"`
CompressionConfig safetensorsSourceQuantization `json:"compression_config"`
TextConfig struct {
Quantization safetensorsSourceQuantization `json:"quantization"`
QuantizationConfig safetensorsSourceQuantization `json:"quantization_config"`
CompressionConfig safetensorsSourceQuantization `json:"compression_config"`
} `json:"text_config"`
}
// safetensorsFP8BlockSize extracts the fp8 weight block size from the
// model's config.json, checking all quantization sections at the top level
// and under text_config. A missing config.json or absent fp8 metadata is not
// an error; the zero value (ok=false) is returned. Sections that disagree on
// the block size are rejected.
func safetensorsFP8BlockSize(fsys fs.FS) (safetensorFP8BlockSize, error) {
bts, err := fs.ReadFile(fsys, "config.json")
if errors.Is(err, fs.ErrNotExist) {
return safetensorFP8BlockSize{}, nil
}
if err != nil {
return safetensorFP8BlockSize{}, err
}
// Scrub non-finite literals (NaN/Infinity) that strict JSON rejects.
bts = sanitizeNonFiniteJSON(bts)
var cfg safetensorsModelConfig
if err := json.Unmarshal(bts, &cfg); err != nil {
return safetensorFP8BlockSize{}, fmt.Errorf("parse config.json fp8 metadata: %w", err)
}
var blocks []safetensorFP8BlockSize
for _, q := range []safetensorsSourceQuantization{
cfg.Quantization,
cfg.QuantizationConfig,
cfg.CompressionConfig,
cfg.TextConfig.Quantization,
cfg.TextConfig.QuantizationConfig,
cfg.TextConfig.CompressionConfig,
} {
// DeepSeek-style export: {"quant_method":"fp8","weight_block_size":[r,c]}.
if strings.EqualFold(q.QuantMethod, "fp8") && len(q.WeightBlockSize) == 2 {
block, err := newSafetensorFP8BlockSize(q.WeightBlockSize[0], q.WeightBlockSize[1])
if err != nil {
return safetensorFP8BlockSize{}, err
}
blocks = append(blocks, block)
}
// compressed-tensors export: float-quantized groups with a block_structure.
if !strings.EqualFold(q.QuantMethod, "compressed-tensors") && !strings.EqualFold(q.Format, "float-quantized") {
continue
}
for _, group := range q.ConfigGroups {
if !strings.EqualFold(group.Format, "float-quantized") ||
group.Weights.NumBits != 8 ||
!strings.EqualFold(group.Weights.Type, "float") ||
len(group.Weights.BlockStructure) != 2 {
continue
}
block, err := newSafetensorFP8BlockSize(group.Weights.BlockStructure[0], group.Weights.BlockStructure[1])
if err != nil {
return safetensorFP8BlockSize{}, err
}
blocks = append(blocks, block)
}
}
if len(blocks) == 0 {
return safetensorFP8BlockSize{}, nil
}
// All sections that declare a block size must agree on a single value.
block := blocks[0]
for _, other := range blocks[1:] {
if other.rows != block.rows || other.cols != block.cols {
return safetensorFP8BlockSize{}, fmt.Errorf("multiple fp8 block sizes in config.json: %dx%d and %dx%d", block.rows, block.cols, other.rows, other.cols)
}
}
return block, nil
}
// newSafetensorFP8BlockSize validates and wraps an fp8 block size read from
// config.json. Both dimensions must be positive.
func newSafetensorFP8BlockSize(rows, cols int) (safetensorFP8BlockSize, error) {
	if rows < 1 || cols < 1 {
		return safetensorFP8BlockSize{}, fmt.Errorf("invalid fp8 block size %dx%d", rows, cols)
	}
	return safetensorFP8BlockSize{rows: rows, cols: cols, ok: true}, nil
}
// decodeFP8E4M3 dequantizes block-wise fp8 (E4M3FN) weight bytes into
// float32 using the tensor's scale companion. Each fp8Block.rows x
// fp8Block.cols block of the 2D weight shares one scale value, so the scale
// tensor must be ceil(rows/blockRows) x ceil(cols/blockCols). data holds one
// byte per element in row-major order.
func (st safetensor) decodeFP8E4M3(data []byte) ([]float32, error) {
if st.scale == nil {
return nil, fmt.Errorf("missing fp8 scale companion for tensor %q", st.name)
}
if !st.fp8Block.ok {
return nil, fmt.Errorf("missing fp8 block size metadata for tensor %q", st.name)
}
if len(st.shape) != 2 {
return nil, fmt.Errorf("expected 2D fp8 tensor %q, got shape %v", st.name, st.shape)
}
rows, cols := int(st.shape[0]), int(st.shape[1])
// The < 0 checks guard uint64->int overflow on absurd shapes.
if rows < 0 || cols < 0 || rows*cols != len(data) {
return nil, fmt.Errorf("fp8 tensor %q shape %v does not match %d bytes", st.name, st.shape, len(data))
}
scale, err := st.readScale()
if err != nil {
return nil, err
}
if len(st.scale.shape) != 2 {
return nil, fmt.Errorf("expected 2D fp8 scale tensor %q, got shape %v", st.scale.name, st.scale.shape)
}
blockRows := st.fp8Block.rows
blockCols := st.fp8Block.cols
scaleRows, scaleCols := int(st.scale.shape[0]), int(st.scale.shape[1])
// Ceiling division: partial edge blocks still get their own scale entry.
expectedRows := (rows + blockRows - 1) / blockRows
expectedCols := (cols + blockCols - 1) / blockCols
if scaleRows != expectedRows || scaleCols != expectedCols {
return nil, fmt.Errorf("unexpected fp8 scale shape %v for tensor %q shape %v; want [%d %d]", st.scale.shape, st.name, st.shape, expectedRows, expectedCols)
}
if len(scale) != scaleRows*scaleCols {
return nil, fmt.Errorf("fp8 scale tensor %q shape %v does not match decoded length %d", st.scale.name, st.scale.shape, len(scale))
}
f32s := make([]float32, len(data))
for r := range rows {
scaleRow := r / blockRows
rowOffset := r * cols
for c := range cols {
// Element (r, c) is multiplied by the scale of its containing block.
f32s[rowOffset+c] = decodeFloat8E4M3FN(data[rowOffset+c]) * scale[scaleRow*scaleCols+c/blockCols]
}
}
return f32s, nil
}
// readScale reads and decodes the fp8 scale companion tensor into float32s.
// Supported scale dtypes are F32, F16, and BF16; anything else is rejected.
// The section reader is closed before returning when it is an io.Closer.
func (st safetensor) readScale() ([]float32, error) {
r, err := st.sectionReader(st.scale.offset, st.scale.size)
if err != nil {
return nil, fmt.Errorf("failed to read fp8 scale tensor %q: %w", st.scale.name, err)
}
if closer, ok := r.(io.Closer); ok {
defer closer.Close()
}
// Scale tensors are small; cap the buffer at 32 KiB or the tensor size.
br := bufio.NewReaderSize(r, min(32<<10, int(st.scale.size)))
switch st.scale.dtype {
case "F32":
f32s := make([]float32, st.scale.size/4)
if err := binary.Read(br, binary.LittleEndian, f32s); err != nil {
return nil, err
}
return f32s, nil
case "F16":
u16s := make([]uint16, st.scale.size/2)
if err := binary.Read(br, binary.LittleEndian, u16s); err != nil {
return nil, err
}
f32s := make([]float32, len(u16s))
for i := range u16s {
f32s[i] = float16.Frombits(u16s[i]).Float32()
}
return f32s, nil
case "BF16":
u8s := make([]uint8, st.scale.size)
if err := binary.Read(br, binary.LittleEndian, u8s); err != nil {
return nil, err
}
return bfloat16.DecodeFloat32(u8s), nil
default:
return nil, fmt.Errorf("unsupported fp8 scale dtype %q for tensor %q", st.scale.dtype, st.scale.name)
}
}
// sectionReader returns a reader over size bytes of the tensor file starting
// at offset. It prefers io.ReaderAt (cheap random access), then io.Seeker,
// and finally falls back to discarding offset bytes from a plain fs.File.
// The returned value wraps the opened file's Closer; callers are expected to
// close it (see readScale).
func (st safetensor) sectionReader(offset, size int64) (io.Reader, error) {
f, err := st.fs.Open(st.path)
if err != nil {
return nil, err
}
if readerAt, ok := f.(io.ReaderAt); ok {
return &readCloserReader{
Reader: io.NewSectionReader(readerAt, offset, size),
Closer: f,
}, nil
}
if seeker, ok := f.(io.Seeker); ok {
if _, err := seeker.Seek(offset, io.SeekStart); err != nil {
// Close on every error path so the file handle is not leaked.
f.Close()
return nil, err
}
return &readCloserReader{
Reader: io.LimitReader(f, size),
Closer: f,
}, nil
}
// Last resort for streams that can neither seek nor ReadAt: consume and
// discard the leading offset bytes.
if _, err := io.CopyN(io.Discard, f, offset); err != nil {
f.Close()
return nil, err
}
return &readCloserReader{
Reader: io.LimitReader(f, size),
Closer: f,
}, nil
}
// readCloserReader pairs a bounded io.Reader over a file section with the
// Closer of the underlying file so callers can release the handle.
type readCloserReader struct {
io.Reader
io.Closer
}
// decodeFloat8E4M3FN converts one E4M3FN-encoded byte to float32. The format
// has a sign bit, 4 exponent bits (bias 7), and 3 mantissa bits; it has no
// infinities, and only the all-ones pattern (exp 0xf, mantissa 0x7) is NaN.
// Exponent 0 encodes subnormals: mantissa/8 * 2^-6.
func decodeFloat8E4M3FN(v byte) float32 {
	var sign float32 = 1
	if v >= 0x80 {
		sign = -1
	}
	exp := int(v>>3) & 0x0f
	frac := float64(v&0x07) / 8
	switch {
	case exp == 0 && frac == 0:
		// Preserve signed zero.
		return sign * 0
	case exp == 0:
		// Subnormal: no implicit leading 1.
		return sign * float32(math.Ldexp(frac, -6))
	case exp == 0x0f && v&0x07 == 0x07:
		return float32(math.NaN())
	default:
		return sign * float32(math.Ldexp(1+frac, exp-7))
	}
}

View file

@ -3,8 +3,10 @@ package convert
import (
"bytes"
"encoding/binary"
"fmt"
"os"
"path/filepath"
"strings"
"testing"
"github.com/d4l3k/go-bfloat16"
@ -231,6 +233,222 @@ func TestSafetensors(t *testing.T) {
}
}
// TestSafetensorWriteToFP8E4M3 checks that a 2x2 F8_E4M3 tensor with a
// single (scalar-block) BF16 scale of 2 is dequantized and re-encoded as
// BF16 with every element doubled.
func TestSafetensorWriteToFP8E4M3(t *testing.T) {
root, err := os.OpenRoot(t.TempDir())
if err != nil {
t.Fatal(err)
}
defer root.Close()
path := filepath.Base(t.Name())
f, err := root.Create(path)
if err != nil {
t.Fatal(err)
}
// E4M3FN encodings for 1.0, 2.0, 0.5, and -1.0.
if _, err := f.Write([]byte{0x38, 0x40, 0x30, 0xb8}); err != nil {
t.Fatal(err)
}
// The scale tensor (value 2.0) follows the weight bytes in the file.
if _, err := f.Write(bfloat16.EncodeFloat32([]float32{2})); err != nil {
t.Fatal(err)
}
if err := f.Close(); err != nil {
t.Fatal(err)
}
st := safetensor{
fs: root.FS(),
path: path,
dtype: "F8_E4M3",
offset: 0,
size: 4,
fp8Block: safetensorFP8BlockSize{rows: 128, cols: 128, ok: true},
scale: &safetensorScale{
name: "linear.weight_scale",
dtype: "BF16",
shape: []uint64{1, 1},
offset: 4,
size: 2,
},
tensorBase: &tensorBase{
name: "linear.weight",
shape: []uint64{2, 2},
},
}
var b bytes.Buffer
if _, err := st.WriteTo(&b); err != nil {
t.Fatal(err)
}
// Each decoded fp8 value multiplied by the scale of 2.
want := bfloat16.EncodeFloat32([]float32{2, 4, 1, -2})
if diff := cmp.Diff(want, b.Bytes()); diff != "" {
t.Errorf("safetensor.WriteTo() mismatch (-want +got):\n%s", diff)
}
}
// TestSafetensorWriteToFP8E4M3UsesConfiguredBlockSize checks that a 3x4
// tensor of fp8 ones with a 2x3 block size and a 2x2 scale tensor applies
// each block's scale (1..4) to the correct elements, including the partial
// edge blocks.
func TestSafetensorWriteToFP8E4M3UsesConfiguredBlockSize(t *testing.T) {
root, err := os.OpenRoot(t.TempDir())
if err != nil {
t.Fatal(err)
}
defer root.Close()
path := filepath.Base(t.Name())
f, err := root.Create(path)
if err != nil {
t.Fatal(err)
}
// Twelve copies of 0x38 (E4M3FN encoding of 1.0).
if _, err := f.Write(bytes.Repeat([]byte{0x38}, 12)); err != nil {
t.Fatal(err)
}
// Four per-block scales laid out row-major as a 2x2 grid.
if _, err := f.Write(bfloat16.EncodeFloat32([]float32{1, 2, 3, 4})); err != nil {
t.Fatal(err)
}
if err := f.Close(); err != nil {
t.Fatal(err)
}
st := safetensor{
fs: root.FS(),
path: path,
dtype: "F8_E4M3",
offset: 0,
size: 12,
fp8Block: safetensorFP8BlockSize{rows: 2, cols: 3, ok: true},
scale: &safetensorScale{
name: "linear.weight_scale",
dtype: "BF16",
shape: []uint64{2, 2},
offset: 12,
size: 8,
},
tensorBase: &tensorBase{
name: "linear.weight",
shape: []uint64{3, 4},
},
}
var b bytes.Buffer
if _, err := st.WriteTo(&b); err != nil {
t.Fatal(err)
}
want := bfloat16.EncodeFloat32([]float32{
1, 1, 1, 2,
1, 1, 1, 2,
3, 3, 3, 4,
})
if diff := cmp.Diff(want, b.Bytes()); diff != "" {
t.Errorf("safetensor.WriteTo() mismatch (-want +got):\n%s", diff)
}
}
// TestParseSafetensorsConsumesFP8ScaleCompanion checks that the scale
// companion tensor is absorbed into its fp8 weight (only one tensor is
// returned) and that the fp8 weight converts to BF16.
func TestParseSafetensorsConsumesFP8ScaleCompanion(t *testing.T) {
tempDir := t.TempDir()
generateSafetensorTestData(t, tempDir, map[string]*tensorData{
"linear.weight": {
Offsets: []int{0, 4},
Type: "F8_E4M3",
Shape: []int{2, 2},
},
"linear.weight_scale": {
Offsets: []int{4, 6},
Type: "BF16",
Shape: []int{1, 1},
},
})
writeFP8BlockConfig(t, tempDir, 128, 128)
tensors, err := parseSafetensors(os.DirFS(tempDir), strings.NewReplacer(), "model-00001-of-00001.safetensors")
if err != nil {
t.Fatal(err)
}
if len(tensors) != 1 {
t.Fatalf("expected one tensor, got %d", len(tensors))
}
if got := tensors[0].Name(); got != "linear.weight" {
t.Fatalf("unexpected tensor name %q", got)
}
if got := tensors[0].Kind(); got != tensorKindBF16 {
t.Fatalf("unexpected fp8 converted kind %d, want %d", got, tensorKindBF16)
}
}
// TestParseSafetensorsRejectsFP8WithoutBlockMetadata checks that an fp8
// tensor is rejected when config.json (and thus the block size) is absent.
func TestParseSafetensorsRejectsFP8WithoutBlockMetadata(t *testing.T) {
tempDir := t.TempDir()
generateSafetensorTestData(t, tempDir, map[string]*tensorData{
"linear.weight": {
Offsets: []int{0, 4},
Type: "F8_E4M3",
Shape: []int{2, 2},
},
"linear.weight_scale": {
Offsets: []int{4, 6},
Type: "BF16",
Shape: []int{1, 1},
},
})
// Note: no writeFP8BlockConfig call, so no block size metadata exists.
_, err := parseSafetensors(os.DirFS(tempDir), strings.NewReplacer(), "model-00001-of-00001.safetensors")
if err == nil || !strings.Contains(err.Error(), "missing fp8 block size metadata") {
t.Fatalf("expected missing fp8 block size metadata error, got %v", err)
}
}
// TestParseSafetensorsRejectsAmbiguousFP8ScaleCompanion checks that two
// distinct candidate scale names ("_scale" and ".scale") for the same fp8
// weight are reported as an ambiguity error.
func TestParseSafetensorsRejectsAmbiguousFP8ScaleCompanion(t *testing.T) {
tempDir := t.TempDir()
generateSafetensorTestData(t, tempDir, map[string]*tensorData{
"linear.weight": {
Offsets: []int{0, 4},
Type: "F8_E4M3",
Shape: []int{2, 2},
},
"linear.weight_scale": {
Offsets: []int{4, 6},
Type: "BF16",
Shape: []int{1, 1},
},
"linear.weight.scale": {
Offsets: []int{6, 8},
Type: "BF16",
Shape: []int{1, 1},
},
})
writeFP8BlockConfig(t, tempDir, 128, 128)
_, err := parseSafetensors(os.DirFS(tempDir), strings.NewReplacer(), "model-00001-of-00001.safetensors")
if err == nil || !strings.Contains(err.Error(), "multiple fp8 scale companions") {
t.Fatalf("expected ambiguous fp8 scale companion error, got %v", err)
}
}
// writeFP8BlockConfig writes a config.json into dir declaring a
// compressed-tensors style float-quantized group with the given fp8 block
// structure, matching the shape safetensorsFP8BlockSize parses.
func writeFP8BlockConfig(t *testing.T, dir string, rows, cols int) {
t.Helper()
config := fmt.Sprintf(`{
"architectures": ["GenericForCausalLM"],
"compression_config": {
"format": "float-quantized",
"config_groups": {
"group_0": {
"format": "float-quantized",
"weights": {
"type": "float",
"num_bits": 8,
"block_structure": [%d, %d]
}
}
}
}
}`, rows, cols)
if err := os.WriteFile(filepath.Join(dir, "config.json"), []byte(config), 0o644); err != nil {
t.Fatal(err)
}
}
func TestSafetensorKind(t *testing.T) {
tests := []struct {
name string

View file

@ -5,6 +5,7 @@ import (
"errors"
"io"
"iter"
"maps"
"path"
"slices"
"strconv"
@ -153,3 +154,54 @@ func (g mergeGroup) WriteTo(w io.Writer) (int64, error) {
return 0, nil
}
// sourceTensorKV returns extra GGUF metadata recording which output tensors
// were converted from fp8 (F8_E4M3) source weights so later quantization can
// treat them specially. It returns nil when no tensor originated as fp8.
// Tensor names are sorted for deterministic output.
func sourceTensorKV(ts []*ggml.Tensor) KV {
sourceFP8 := make(map[string]struct{})
for _, t := range ts {
if writerSourceDType(t.WriterTo) == "F8_E4M3" {
sourceFP8[t.Name] = struct{}{}
}
}
if len(sourceFP8) == 0 {
return nil
}
return KV{
"source_quantization": "hf_fp8",
"source_fp8_tensors": slices.Sorted(maps.Keys(sourceFP8)),
}
}
// sourceDTypeTensor is implemented by tensors that can report the dtype they
// had in the source checkpoint (e.g. safetensor.SourceDType).
type sourceDTypeTensor interface {
SourceDType() string
}
// writerSourceDType reports the source dtype of the tensor behind w, or ""
// when it is unknown. For a mergeGroup the dtype is returned only when every
// member agrees; a mixed or empty group yields "".
func writerSourceDType(w io.WriterTo) string {
switch w := w.(type) {
case sourceDTypeTensor:
return w.SourceDType()
case mergeGroup:
if len(w) == 0 {
return ""
}
dtype := sourceDType(w[0])
if dtype == "" {
return ""
}
for _, t := range w[1:] {
if sourceDType(t) != dtype {
return ""
}
}
return dtype
default:
return ""
}
}
// sourceDType reports the source checkpoint dtype of t, or "" when the
// tensor does not expose one.
func sourceDType(t Tensor) string {
	st, ok := t.(sourceDTypeTensor)
	if !ok {
		return ""
	}
	return st.SourceDType()
}

View file

@ -21,7 +21,8 @@ type fakeTensor struct {
shape []uint64
data []float32
repacker Repacker
sourceDType string
repacker Repacker
}
func (f fakeTensor) Name() string {
@ -36,16 +37,21 @@ func (f fakeTensor) Kind() uint32 {
return 0
}
func (f fakeTensor) SourceDType() string {
return f.sourceDType
}
func (f *fakeTensor) SetRepacker(fn Repacker) {
f.repacker = fn
}
func (f fakeTensor) Clone() Tensor {
return &fakeTensor{
name: f.name,
shape: slices.Clone(f.shape),
data: slices.Clone(f.data),
repacker: f.repacker,
name: f.name,
shape: slices.Clone(f.shape),
data: slices.Clone(f.data),
sourceDType: f.sourceDType,
repacker: f.repacker,
}
}
@ -995,3 +1001,43 @@ func TestMergeOrder(t *testing.T) {
})
}
}
// TestSourceTensorKVRecordsFP8OutputTensors checks that only tensors whose
// source dtype is F8_E4M3 are recorded in the source_fp8_tensors metadata.
func TestSourceTensorKVRecordsFP8OutputTensors(t *testing.T) {
fp8 := &fakeTensor{name: "linear.weight", shape: []uint64{2, 2}, sourceDType: "F8_E4M3"}
bf16 := &fakeTensor{name: "other.weight", shape: []uint64{2, 2}, sourceDType: "BF16"}
kv := sourceTensorKV([]*ggml.Tensor{
{Name: "blk.0.linear.weight", WriterTo: fp8},
{Name: "blk.0.other.weight", WriterTo: bf16},
})
if got := kv["source_quantization"]; got != "hf_fp8" {
t.Fatalf("source_quantization = %v, want hf_fp8", got)
}
got, ok := kv["source_fp8_tensors"].([]string)
if !ok {
t.Fatalf("source_fp8_tensors = %#v, want []string", kv["source_fp8_tensors"])
}
if diff := cmp.Diff([]string{"blk.0.linear.weight"}, got); diff != "" {
t.Fatalf("source_fp8_tensors mismatch (-want +got):\n%s", diff)
}
}
// TestSourceTensorKVRecordsMergedFP8OutputTensors checks the mergeGroup
// rule: an all-fp8 group is recorded, but a group mixing fp8 and BF16
// sources is not.
func TestSourceTensorKVRecordsMergedFP8OutputTensors(t *testing.T) {
fp8A := &fakeTensor{name: "expert.0.weight", shape: []uint64{2, 2}, sourceDType: "F8_E4M3"}
fp8B := &fakeTensor{name: "expert.1.weight", shape: []uint64{2, 2}, sourceDType: "F8_E4M3"}
bf16 := &fakeTensor{name: "expert.2.weight", shape: []uint64{2, 2}, sourceDType: "BF16"}
kv := sourceTensorKV([]*ggml.Tensor{
{Name: "ffn_exps.weight", WriterTo: mergeGroup{fp8A, fp8B}},
{Name: "mixed_exps.weight", WriterTo: mergeGroup{fp8A, bf16}},
})
got, ok := kv["source_fp8_tensors"].([]string)
if !ok {
t.Fatalf("source_fp8_tensors = %#v, want []string", kv["source_fp8_tensors"])
}
if diff := cmp.Diff([]string{"ffn_exps.weight"}, got); diff != "" {
t.Fatalf("source_fp8_tensors mismatch (-want +got):\n%s", diff)
}
}

View file

@ -494,15 +494,18 @@ func createModel(r api.CreateRequest, name model.Name, baseLayers []*layerGGML,
for _, layer := range baseLayers {
if layer.GGML != nil {
quantType := strings.ToUpper(cmp.Or(r.Quantize, r.Quantization))
ft := layer.GGML.KV().FileType()
if quantType == "" && hasSourceFP8Tensors(layer.GGML.KV()) && layer.GGML.Name() == "gguf" && layer.MediaType == "application/vnd.ollama.image.model" && slices.Contains([]string{"F16", "BF16", "F32"}, ft.String()) {
quantType = "Q8_0"
}
if quantType != "" && layer.GGML.Name() == "gguf" && layer.MediaType == "application/vnd.ollama.image.model" {
want, err := ggml.ParseFileType(quantType)
if err != nil {
return err
}
ft := layer.GGML.KV().FileType()
if !slices.Contains([]string{"F16", "F32"}, ft.String()) {
return errors.New("quantization is only supported for F16 and F32 models")
if !slices.Contains([]string{"F16", "BF16", "F32"}, ft.String()) {
return errors.New("quantization is only supported for F16, BF16 and F32 models")
} else if ft != want {
layer, err = quantizeLayer(layer, quantType, fn)
if err != nil {
@ -606,6 +609,10 @@ func createModel(r api.CreateRequest, name model.Name, baseLayers []*layerGGML,
return nil
}
// hasSourceFP8Tensors reports whether this GGUF was converted from an fp8
// checkpoint, as recorded by the converter's sourceTensorKV metadata.
func hasSourceFP8Tensors(kv ggml.KV) bool {
return kv.String("source_quantization") == "hf_fp8" && len(kv.Strings("source_fp8_tensors")) > 0
}
func quantizeLayer(layer *layerGGML, quantizeType string, fn func(resp api.ProgressResponse)) (*layerGGML, error) {
ft := layer.GGML.KV().FileType()
var doneBytes atomic.Uint64

View file

@ -51,11 +51,14 @@ func (q quantizer) WriteTo(w io.Writer) (int64, error) {
}
type quantizeState struct {
nAttnV int // Number of attn_*v* weight tensors
nFfnDown int // Number of ffn_down tensors
iAttnV int // Running counter of number of attn_v tensors that have been processed
iFfnDown int // Running counter of number of ffn_down tensors that have been processed
hasOutput bool // used to figure out if a model shares tok_embd with the output weight
nAttnV int // Number of attn_*v* weight tensors
nFfnDown int // Number of ffn_down tensors
iAttnV int // Running counter of number of attn_v tensors that have been processed
iFfnDown int // Running counter of number of ffn_down tensors that have been processed
hasOutput bool // used to figure out if a model shares tok_embd with the output weight
preserveSourceFP8ToQ8 bool
preserveSourceQ4 bool
sourceFP8Tensors map[string]struct{}
}
func useMoreBits(iLayer, nLayers int) bool {
@ -120,10 +123,10 @@ func getTensorNewType(kv fsggml.KV, qs *quantizeState, newType fsggml.TensorType
newType = fsggml.TensorTypeQ6_K
}
} else if strings.Contains(name, "attn_v.weight") {
if (ftype == fsggml.FileTypeQ4_K_M) &&
if newType != fsggml.TensorTypeQ8_0 && (ftype == fsggml.FileTypeQ4_K_M) &&
useMoreBits(qs.iAttnV, qs.nAttnV) {
newType = fsggml.TensorTypeQ6_K
} else if ftype == fsggml.FileTypeQ4_K_S && qs.iAttnV < 4 {
} else if newType != fsggml.TensorTypeQ8_0 && ftype == fsggml.FileTypeQ4_K_S && qs.iAttnV < 4 {
newType = fsggml.TensorTypeQ5_K
}
@ -164,21 +167,21 @@ func getTensorNewType(kv fsggml.KV, qs *quantizeState, newType fsggml.TensorType
qs.iFfnDown++
}
n_layer := qs.nFfnDown
if ftype == fsggml.FileTypeQ4_K_M {
if newType != fsggml.TensorTypeQ8_0 && ftype == fsggml.FileTypeQ4_K_M {
if useMoreBits(iLayer, n_layer) {
newType = fsggml.TensorTypeQ6_K
}
} else if ftype == fsggml.FileTypeQ4_K_S && iLayer < n_layer/8 {
} else if newType != fsggml.TensorTypeQ8_0 && ftype == fsggml.FileTypeQ4_K_S && iLayer < n_layer/8 {
newType = fsggml.TensorTypeQ5_K
}
} else if strings.Contains(name, "attn_output.weight") {
if nExperts == 8 {
if newType != fsggml.TensorTypeQ8_0 && nExperts == 8 {
if ftype == fsggml.FileTypeQ4_K_S || ftype == fsggml.FileTypeQ4_K_M {
newType = fsggml.TensorTypeQ5_K
}
}
} else if strings.Contains(name, "attn_qkv.weight") {
if ftype == fsggml.FileTypeQ4_K_M {
if newType != fsggml.TensorTypeQ8_0 && ftype == fsggml.FileTypeQ4_K_M {
newType = fsggml.TensorTypeQ5_K
}
}
@ -218,7 +221,12 @@ func quantize(in, out *os.File, orig *fsggml.GGML, newFileType fsggml.FileType,
kv := maps.Clone(orig.KV())
kv["general.file_type"] = newFileType
// kv["general.quantization_version"] = ggml.QuantizationVersion()
qs := &quantizeState{}
qs := &quantizeState{
sourceFP8Tensors: sourceFP8TensorSet(kv),
}
hasSourceFP8 := hasSourceFP8Tensors(kv)
qs.preserveSourceFP8ToQ8 = hasSourceFP8 && newFileType == fsggml.FileTypeQ8_0
qs.preserveSourceQ4 = hasSourceFP8 && slices.Contains([]fsggml.FileType{fsggml.FileTypeQ4_K_M, fsggml.FileTypeQ4_K_S}, newFileType)
// Build up the quantize state so newType can adjust types
layerCount := 0
for k, l := range orig.Tensors().GroupLayers() {
@ -304,6 +312,12 @@ func newType(t *fsggml.Tensor, kv fsggml.KV, qs *quantizeState, ftype fsggml.Fil
newType := fsggml.TensorType(t.Kind)
if quantize {
if qs.preserveSourceFP8ToQ8 {
if _, ok := qs.sourceFP8Tensors[name]; !ok {
return newType
}
}
if slices.Contains([]string{"qwen3next", "qwen35", "qwen35moe"}, kv.Architecture()) && (ftype == fsggml.FileTypeQ4_K_M || ftype == fsggml.FileTypeQ4_K_S) {
if qt, ok := qwen3LinearAttnQuantType(name); ok {
return qt
@ -311,6 +325,11 @@ func newType(t *fsggml.Tensor, kv fsggml.KV, qs *quantizeState, ftype fsggml.Fil
}
// get more optimal quantization type based on the tensor shape, layer, etc.
if qs.preserveSourceQ4 {
if _, ok := qs.sourceFP8Tensors[name]; !ok {
defaultType = fsggml.TensorTypeQ8_0
}
}
newType = getTensorNewType(kv, qs, defaultType, t.Name, t.Shape, ftype)
if newType != defaultType {
slog.Debug("tensor quantization adjusted for better quality", "name", t.Name, "requested", defaultType, "quantization", newType)
@ -318,3 +337,16 @@ func newType(t *fsggml.Tensor, kv fsggml.KV, qs *quantizeState, ftype fsggml.Fil
}
return newType
}
// sourceFP8TensorSet builds a membership set of the tensor names recorded as
// having fp8 source weights. It returns nil when the metadata key is absent
// or empty.
func sourceFP8TensorSet(kv fsggml.KV) map[string]struct{} {
	names := kv.Strings("source_fp8_tensors")
	if len(names) == 0 {
		return nil
	}
	set := make(map[string]struct{}, len(names))
	for _, tensorName := range names {
		set[tensorName] = struct{}{}
	}
	return set
}

View file

@ -308,6 +308,95 @@ func TestQuantizeModel(t *testing.T) {
"output.weight": fsggml.TensorTypeQ8_0,
},
},
{
name: "source_fp8_q8_preserves_bf16_tensors",
kv: map[string]any{
"general.architecture": "test",
"source_quantization": "hf_fp8",
"source_fp8_tensors": []string{"blk.1.ffn_down_exps.weight"},
},
tensors: []*fsggml.Tensor{
{
Name: "blk.1.ffn_down_exps.weight", Kind: uint32(fsggml.TensorTypeBF16),
Offset: uint64(0), Shape: []uint64{256, 1},
WriterTo: bytes.NewReader(quantBytes[fsggml.TensorTypeBF16]),
},
{
Name: "blk.1.attn_q.weight", Kind: uint32(fsggml.TensorTypeBF16),
Offset: uint64(0), Shape: []uint64{256, 1},
WriterTo: bytes.NewReader(quantBytes[fsggml.TensorTypeBF16]),
},
},
newType: "Q8_0",
expectedTensorTypes: map[string]fsggml.TensorType{
"blk.1.ffn_down_exps.weight": fsggml.TensorTypeQ8_0,
"blk.1.attn_q.weight": fsggml.TensorTypeBF16,
},
},
{
name: "source_fp8_q4_promotes_bf16_tensors_to_q8",
kv: map[string]any{
"general.architecture": "test",
"source_quantization": "hf_fp8",
"source_fp8_tensors": []string{
"blk.1.ffn_gate_exps.weight",
"blk.1.ffn_down_exps.weight",
},
},
tensors: []*fsggml.Tensor{
{
Name: "blk.1.ffn_gate_exps.weight", Kind: uint32(fsggml.TensorTypeBF16),
Offset: uint64(0), Shape: []uint64{256, 1},
WriterTo: bytes.NewReader(quantBytes[fsggml.TensorTypeBF16]),
},
{
Name: "blk.1.ffn_down_exps.weight", Kind: uint32(fsggml.TensorTypeBF16),
Offset: uint64(0), Shape: []uint64{256, 1},
WriterTo: bytes.NewReader(quantBytes[fsggml.TensorTypeBF16]),
},
{
Name: "blk.1.attn_q.weight", Kind: uint32(fsggml.TensorTypeBF16),
Offset: uint64(0), Shape: []uint64{256, 1},
WriterTo: bytes.NewReader(quantBytes[fsggml.TensorTypeBF16]),
},
{
Name: "blk.1.attn_v.weight", Kind: uint32(fsggml.TensorTypeBF16),
Offset: uint64(0), Shape: []uint64{256, 1},
WriterTo: bytes.NewReader(quantBytes[fsggml.TensorTypeBF16]),
},
{
Name: "blk.1.ffn_down.weight", Kind: uint32(fsggml.TensorTypeBF16),
Offset: uint64(0), Shape: []uint64{256, 1},
WriterTo: bytes.NewReader(quantBytes[fsggml.TensorTypeBF16]),
},
{
Name: "blk.1.attn_q_norm.weight", Kind: uint32(fsggml.TensorTypeBF16),
Offset: uint64(0), Shape: []uint64{256},
WriterTo: bytes.NewReader(quantBytes[fsggml.TensorTypeBF16]),
},
{
Name: "blk.1.ffn_gate_inp.weight", Kind: uint32(fsggml.TensorTypeBF16),
Offset: uint64(0), Shape: []uint64{256, 1},
WriterTo: bytes.NewReader(quantBytes[fsggml.TensorTypeBF16]),
},
{
Name: "output.weight", Kind: uint32(fsggml.TensorTypeBF16),
Offset: uint64(0), Shape: []uint64{256, 1},
WriterTo: bytes.NewReader(quantBytes[fsggml.TensorTypeBF16]),
},
},
newType: "Q4_K_M",
expectedTensorTypes: map[string]fsggml.TensorType{
"blk.1.ffn_gate_exps.weight": fsggml.TensorTypeQ4_K,
"blk.1.ffn_down_exps.weight": fsggml.TensorTypeQ6_K,
"blk.1.attn_q.weight": fsggml.TensorTypeQ8_0,
"blk.1.attn_v.weight": fsggml.TensorTypeQ8_0,
"blk.1.ffn_down.weight": fsggml.TensorTypeQ8_0,
"blk.1.attn_q_norm.weight": fsggml.TensorTypeBF16,
"blk.1.ffn_gate_inp.weight": fsggml.TensorTypeBF16,
"output.weight": fsggml.TensorTypeQ8_0,
},
},
{
name: "f32_short_data",
kv: map[string]any{