diff --git a/convert/convert.go b/convert/convert.go index 876ac54c0..0f3d5f772 100644 --- a/convert/convert.go +++ b/convert/convert.go @@ -387,6 +387,10 @@ func ConvertModel(fsys fs.FS, f *os.File) error { } func writeFile(f *os.File, kv KV, ts []*ggml.Tensor) error { + for k, v := range sourceTensorKV(ts) { + kv[k] = v + } + for i := range ts { ts[i].Shape = slices.Clone(ts[i].Shape) slices.Reverse(ts[i].Shape) diff --git a/convert/reader_safetensors.go b/convert/reader_safetensors.go index 6127ab566..b67fe4fcd 100644 --- a/convert/reader_safetensors.go +++ b/convert/reader_safetensors.go @@ -5,10 +5,12 @@ import ( "bytes" "encoding/binary" "encoding/json" + "errors" "fmt" "io" "io/fs" "maps" + "math" "slices" "strings" @@ -23,6 +25,11 @@ type safetensorMetadata struct { } func parseSafetensors(fsys fs.FS, replacer *strings.Replacer, ps ...string) ([]Tensor, error) { + fp8Block, err := safetensorsFP8BlockSize(fsys) + if err != nil { + return nil, err + } + var ts []Tensor for _, p := range ps { f, err := fsys.Open(p) @@ -50,24 +57,47 @@ func parseSafetensors(fsys fs.FS, replacer *strings.Replacer, ps ...string) ([]T names := make(map[string]struct{}, len(keys)) + fp8Scales, err := collectSafetensorsFP8Scales(n, headers) + if err != nil { + return nil, err + } + for _, key := range keys { if value := headers[key]; value.Type != "" { + if _, ok := fp8Scales.consumed[key]; ok { + continue + } + // Scalar tensors (e.g. clipped linear min/max) are 0-dim in safetensors. // Promote them to 1-dim so they can be stored in GGUF. if len(value.Shape) == 0 { value.Shape = []uint64{1} } + + var scale *safetensorScale + if value.Type == "F8_E4M3" { + if !fp8Block.ok { + return nil, fmt.Errorf("missing fp8 block size metadata for tensor %q", key) + } + scale = fp8Scales.byWeight[key] + if scale == nil { + return nil, fmt.Errorf("missing fp8 scale companion for tensor %q", key) + } + } + ggufName := replacer.Replace(key) if _, ok := names[ggufName]; ok { return nil, fmt.Errorf("duplicate tensor name '%s' was found for this model", ggufName) } names[ggufName] = struct{}{} ts = append(ts, safetensor{ - fs: fsys, - path: p, - dtype: value.Type, - offset: safetensorsPad(n, value.Offsets[0]), - size: safetensorsPad(n, value.Offsets[1]) - safetensorsPad(n, value.Offsets[0]), + fs: fsys, + path: p, + dtype: value.Type, + offset: safetensorsPad(n, value.Offsets[0]), + size: safetensorsPad(n, value.Offsets[1]) - safetensorsPad(n, value.Offsets[0]), + scale: scale, + fp8Block: fp8Block, tensorBase: &tensorBase{ name: ggufName, shape: value.Shape, @@ -85,12 +115,22 @@ func safetensorsPad(n, offset int64) int64 { return 8 + n + offset } -type safetensor struct { - fs fs.FS - path string +type safetensorScale struct { + name string dtype string + shape []uint64 offset int64 size int64 +} + +type safetensor struct { + fs fs.FS + path string + dtype string + offset int64 + size int64 + scale *safetensorScale + fp8Block safetensorFP8BlockSize *tensorBase } @@ -104,17 +144,26 @@ func (st safetensor) Kind() uint32 { kind != tensorKindFP32 { kind = tensorKindBF16 } + if st.dtype == "F8_E4M3" && kind != tensorKindFP32 { + kind = tensorKindBF16 + } return kind } +func (st safetensor) SourceDType() string { + return st.dtype +} + func (st safetensor) Clone() Tensor { return &safetensor{ - fs: st.fs, - path: st.path, - dtype: st.dtype, - offset: st.offset, - size: st.size, + fs: st.fs, + path: st.path, + dtype: st.dtype, + offset: st.offset, + size: st.size, + scale: st.scale.Clone(), + fp8Block: st.fp8Block, tensorBase: &tensorBase{ name: st.name, repacker: st.repacker, @@ -123,6 +172,19 @@ func (st safetensor) Clone() Tensor { } } +func (ss *safetensorScale) Clone() *safetensorScale { + if ss == nil { + return nil + } + return &safetensorScale{ + name: ss.name, + dtype: ss.dtype, + shape: slices.Clone(ss.shape), + offset: ss.offset, + size: ss.size, + } +} + func (st safetensor) WriteTo(w io.Writer) (int64, error) { f, err := st.fs.Open(st.path) if err != nil { @@ -180,6 +242,16 @@ func (st safetensor) WriteTo(w io.Writer) (int64, error) { } f32s = bfloat16.DecodeFloat32(u8s) + case "F8_E4M3": + u8s := make([]uint8, st.size) + if err = binary.Read(br, binary.LittleEndian, u8s); err != nil { + return 0, err + } + + f32s, err = st.decodeFP8E4M3(u8s) + if err != nil { + return 0, err + } default: return 0, fmt.Errorf("unknown data type: %s", st.dtype) } @@ -208,3 +280,334 @@ func (st safetensor) WriteTo(w io.Writer) (int64, error) { return 0, fmt.Errorf("unknown storage type: %d", st.Kind()) } } + +type safetensorsFP8Scales struct { + byWeight map[string]*safetensorScale + consumed map[string]struct{} +} + +func collectSafetensorsFP8Scales(n int64, headers map[string]safetensorMetadata) (safetensorsFP8Scales, error) { + scales := safetensorsFP8Scales{ + byWeight: make(map[string]*safetensorScale), + consumed: make(map[string]struct{}), + } + + for key, value := range headers { + if value.Type != "F8_E4M3" { + continue + } + + scaleKey, scaleValue, ok, err := safetensorsFP8Scale(key, headers) + if err != nil { + return safetensorsFP8Scales{}, err + } + if !ok { + continue + } + if _, ok := scales.consumed[scaleKey]; ok { + return safetensorsFP8Scales{}, fmt.Errorf("fp8 scale companion %q is used by multiple tensors", scaleKey) + } + + scales.byWeight[key] = &safetensorScale{ + name: scaleKey, + dtype: scaleValue.Type, + shape: slices.Clone(scaleValue.Shape), + offset: safetensorsPad(n, scaleValue.Offsets[0]), + size: safetensorsPad(n, scaleValue.Offsets[1]) - safetensorsPad(n, scaleValue.Offsets[0]), + } + scales.consumed[scaleKey] = struct{}{} + } + + return scales, nil +} + +func safetensorsFP8Scale(key string, headers map[string]safetensorMetadata) (string, safetensorMetadata, bool, error) { + candidates := safetensorsFP8ScaleCandidates(key) + + var scaleKey string + var scaleValue safetensorMetadata + if strings.HasSuffix(key, ".weight") { + // Keep support for compressed-tensors exports that place the scale name + // between the module path and weight suffix. + base := strings.TrimSuffix(key, ".weight") + candidates = appendUnique(candidates, base+".weight_scale") + candidates = appendUnique(candidates, base+".weight_scale_inv") + } + + for _, candidate := range candidates { + if value, ok := headers[candidate]; ok && value.Type != "" { + if scaleKey != "" { + return "", safetensorMetadata{}, false, fmt.Errorf("multiple fp8 scale companions for tensor %q: %q and %q", key, scaleKey, candidate) + } + scaleKey = candidate + scaleValue = value + } + } + if scaleKey == "" { + return "", safetensorMetadata{}, false, nil + } + + return scaleKey, scaleValue, true, nil +} + +func safetensorsFP8ScaleCandidates(key string) []string { + var candidates []string + candidates = appendUnique(candidates, key+"_scale") + candidates = appendUnique(candidates, key+"_scale_inv") + candidates = appendUnique(candidates, key+".scale") + candidates = appendUnique(candidates, key+".scale_inv") + return candidates +} + +func appendUnique(values []string, value string) []string { + if !slices.Contains(values, value) { + values = append(values, value) + } + return values +} + +type safetensorFP8BlockSize struct { + rows int + cols int + ok bool +} + +type safetensorsSourceQuantization struct { + QuantMethod string `json:"quant_method"` + Format string `json:"format"` + WeightBlockSize []int `json:"weight_block_size"` + ConfigGroups map[string]struct { + Format string `json:"format"` + Weights struct { + BlockStructure []int `json:"block_structure"` + NumBits int `json:"num_bits"` + Type string `json:"type"` + } `json:"weights"` + } `json:"config_groups"` +} + +type safetensorsModelConfig struct { + Quantization safetensorsSourceQuantization `json:"quantization"` + QuantizationConfig safetensorsSourceQuantization `json:"quantization_config"` + CompressionConfig safetensorsSourceQuantization `json:"compression_config"` + TextConfig struct { + Quantization safetensorsSourceQuantization `json:"quantization"` + QuantizationConfig safetensorsSourceQuantization `json:"quantization_config"` + CompressionConfig safetensorsSourceQuantization `json:"compression_config"` + } `json:"text_config"` +} + +func safetensorsFP8BlockSize(fsys fs.FS) (safetensorFP8BlockSize, error) { + bts, err := fs.ReadFile(fsys, "config.json") + if errors.Is(err, fs.ErrNotExist) { + return safetensorFP8BlockSize{}, nil + } + if err != nil { + return safetensorFP8BlockSize{}, err + } + bts = sanitizeNonFiniteJSON(bts) + + var cfg safetensorsModelConfig + if err := json.Unmarshal(bts, &cfg); err != nil { + return safetensorFP8BlockSize{}, fmt.Errorf("parse config.json fp8 metadata: %w", err) + } + + var blocks []safetensorFP8BlockSize + for _, q := range []safetensorsSourceQuantization{ + cfg.Quantization, + cfg.QuantizationConfig, + cfg.CompressionConfig, + cfg.TextConfig.Quantization, + cfg.TextConfig.QuantizationConfig, + cfg.TextConfig.CompressionConfig, + } { + if strings.EqualFold(q.QuantMethod, "fp8") && len(q.WeightBlockSize) == 2 { + block, err := newSafetensorFP8BlockSize(q.WeightBlockSize[0], q.WeightBlockSize[1]) + if err != nil { + return safetensorFP8BlockSize{}, err + } + blocks = append(blocks, block) + } + + if !strings.EqualFold(q.QuantMethod, "compressed-tensors") && !strings.EqualFold(q.Format, "float-quantized") { + continue + } + for _, group := range q.ConfigGroups { + if !strings.EqualFold(group.Format, "float-quantized") || + group.Weights.NumBits != 8 || + !strings.EqualFold(group.Weights.Type, "float") || + len(group.Weights.BlockStructure) != 2 { + continue + } + block, err := newSafetensorFP8BlockSize(group.Weights.BlockStructure[0], group.Weights.BlockStructure[1]) + if err != nil { + return safetensorFP8BlockSize{}, err + } + blocks = append(blocks, block) + } + } + + if len(blocks) == 0 { + return safetensorFP8BlockSize{}, nil + } + + block := blocks[0] + for _, other := range blocks[1:] { + if other.rows != block.rows || other.cols != block.cols { + return safetensorFP8BlockSize{}, fmt.Errorf("multiple fp8 block sizes in config.json: %dx%d and %dx%d", block.rows, block.cols, other.rows, other.cols) + } + } + return block, nil +} + +func newSafetensorFP8BlockSize(rows, cols int) (safetensorFP8BlockSize, error) { + if rows <= 0 || cols <= 0 { + return safetensorFP8BlockSize{}, fmt.Errorf("invalid fp8 block size %dx%d", rows, cols) + } + return safetensorFP8BlockSize{rows: rows, cols: cols, ok: true}, nil +} + +func (st safetensor) decodeFP8E4M3(data []byte) ([]float32, error) { + if st.scale == nil { + return nil, fmt.Errorf("missing fp8 scale companion for tensor %q", st.name) + } + if !st.fp8Block.ok { + return nil, fmt.Errorf("missing fp8 block size metadata for tensor %q", st.name) + } + if len(st.shape) != 2 { + return nil, fmt.Errorf("expected 2D fp8 tensor %q, got shape %v", st.name, st.shape) + } + + rows, cols := int(st.shape[0]), int(st.shape[1]) + if rows < 0 || cols < 0 || rows*cols != len(data) { + return nil, fmt.Errorf("fp8 tensor %q shape %v does not match %d bytes", st.name, st.shape, len(data)) + } + + scale, err := st.readScale() + if err != nil { + return nil, err + } + + if len(st.scale.shape) != 2 { + return nil, fmt.Errorf("expected 2D fp8 scale tensor %q, got shape %v", st.scale.name, st.scale.shape) + } + + blockRows := st.fp8Block.rows + blockCols := st.fp8Block.cols + scaleRows, scaleCols := int(st.scale.shape[0]), int(st.scale.shape[1]) + expectedRows := (rows + blockRows - 1) / blockRows + expectedCols := (cols + blockCols - 1) / blockCols + if scaleRows != expectedRows || scaleCols != expectedCols { + return nil, fmt.Errorf("unexpected fp8 scale shape %v for tensor %q shape %v; want [%d %d]", st.scale.shape, st.name, st.shape, expectedRows, expectedCols) + } + if len(scale) != scaleRows*scaleCols { + return nil, fmt.Errorf("fp8 scale tensor %q shape %v does not match decoded length %d", st.scale.name, st.scale.shape, len(scale)) + } + + f32s := make([]float32, len(data)) + for r := range rows { + scaleRow := r / blockRows + rowOffset := r * cols + for c := range cols { + f32s[rowOffset+c] = decodeFloat8E4M3FN(data[rowOffset+c]) * scale[scaleRow*scaleCols+c/blockCols] + } + } + + return f32s, nil +} + +func (st safetensor) readScale() ([]float32, error) { + r, err := st.sectionReader(st.scale.offset, st.scale.size) + if err != nil { + return nil, fmt.Errorf("failed to read fp8 scale tensor %q: %w", st.scale.name, err) + } + if closer, ok := r.(io.Closer); ok { + defer closer.Close() + } + br := bufio.NewReaderSize(r, min(32<<10, int(st.scale.size))) + + switch st.scale.dtype { + case "F32": + f32s := make([]float32, st.scale.size/4) + if err := binary.Read(br, binary.LittleEndian, f32s); err != nil { + return nil, err + } + return f32s, nil + case "F16": + u16s := make([]uint16, st.scale.size/2) + if err := binary.Read(br, binary.LittleEndian, u16s); err != nil { + return nil, err + } + f32s := make([]float32, len(u16s)) + for i := range u16s { + f32s[i] = float16.Frombits(u16s[i]).Float32() + } + return f32s, nil + case "BF16": + u8s := make([]uint8, st.scale.size) + if err := binary.Read(br, binary.LittleEndian, u8s); err != nil { + return nil, err + } + return bfloat16.DecodeFloat32(u8s), nil + default: + return nil, fmt.Errorf("unsupported fp8 scale dtype %q for tensor %q", st.scale.dtype, st.scale.name) + } +} + +func (st safetensor) sectionReader(offset, size int64) (io.Reader, error) { + f, err := st.fs.Open(st.path) + if err != nil { + return nil, err + } + + if readerAt, ok := f.(io.ReaderAt); ok { + return &readCloserReader{ + Reader: io.NewSectionReader(readerAt, offset, size), + Closer: f, + }, nil + } + if seeker, ok := f.(io.Seeker); ok { + if _, err := seeker.Seek(offset, io.SeekStart); err != nil { + f.Close() + return nil, err + } + return &readCloserReader{ + Reader: io.LimitReader(f, size), + Closer: f, + }, nil + } + if _, err := io.CopyN(io.Discard, f, offset); err != nil { + f.Close() + return nil, err + } + return &readCloserReader{ + Reader: io.LimitReader(f, size), + Closer: f, + }, nil +} + +type readCloserReader struct { + io.Reader + io.Closer +} + +func decodeFloat8E4M3FN(v byte) float32 { + sign := float32(1) + if v&0x80 != 0 { + sign = -1 + } + + exp := int((v >> 3) & 0x0f) + mant := int(v & 0x07) + if exp == 0 { + if mant == 0 { + return 0 * sign + } + return sign * float32(math.Ldexp(float64(mant)/8, -6)) + } + if exp == 0x0f && mant == 0x07 { + return float32(math.NaN()) + } + + return sign * float32(math.Ldexp(1+float64(mant)/8, exp-7)) +} diff --git a/convert/reader_test.go b/convert/reader_test.go index c3d094f10..3788b65ec 100644 --- a/convert/reader_test.go +++ b/convert/reader_test.go @@ -3,8 +3,10 @@ package convert import ( "bytes" "encoding/binary" + "fmt" "os" "path/filepath" + "strings" "testing" "github.com/d4l3k/go-bfloat16" @@ -231,6 +233,222 @@ func TestSafetensors(t *testing.T) { } } +func TestSafetensorWriteToFP8E4M3(t *testing.T) { + root, err := os.OpenRoot(t.TempDir()) + if err != nil { + t.Fatal(err) + } + defer root.Close() + + path := filepath.Base(t.Name()) + f, err := root.Create(path) + if err != nil { + t.Fatal(err) + } + + // E4M3FN encodings for 1.0, 2.0, 0.5, and -1.0. + if _, err := f.Write([]byte{0x38, 0x40, 0x30, 0xb8}); err != nil { + t.Fatal(err) + } + if _, err := f.Write(bfloat16.EncodeFloat32([]float32{2})); err != nil { + t.Fatal(err) + } + if err := f.Close(); err != nil { + t.Fatal(err) + } + + st := safetensor{ + fs: root.FS(), + path: path, + dtype: "F8_E4M3", + offset: 0, + size: 4, + fp8Block: safetensorFP8BlockSize{rows: 128, cols: 128, ok: true}, + scale: &safetensorScale{ + name: "linear.weight_scale", + dtype: "BF16", + shape: []uint64{1, 1}, + offset: 4, + size: 2, + }, + tensorBase: &tensorBase{ + name: "linear.weight", + shape: []uint64{2, 2}, + }, + } + + var b bytes.Buffer + if _, err := st.WriteTo(&b); err != nil { + t.Fatal(err) + } + + want := bfloat16.EncodeFloat32([]float32{2, 4, 1, -2}) + if diff := cmp.Diff(want, b.Bytes()); diff != "" { + t.Errorf("safetensor.WriteTo() mismatch (-want +got):\n%s", diff) + } +} + +func TestSafetensorWriteToFP8E4M3UsesConfiguredBlockSize(t *testing.T) { + root, err := os.OpenRoot(t.TempDir()) + if err != nil { + t.Fatal(err) + } + defer root.Close() + + path := filepath.Base(t.Name()) + f, err := root.Create(path) + if err != nil { + t.Fatal(err) + } + + if _, err := f.Write(bytes.Repeat([]byte{0x38}, 12)); err != nil { + t.Fatal(err) + } + if _, err := f.Write(bfloat16.EncodeFloat32([]float32{1, 2, 3, 4})); err != nil { + t.Fatal(err) + } + if err := f.Close(); err != nil { + t.Fatal(err) + } + + st := safetensor{ + fs: root.FS(), + path: path, + dtype: "F8_E4M3", + offset: 0, + size: 12, + fp8Block: safetensorFP8BlockSize{rows: 2, cols: 3, ok: true}, + scale: &safetensorScale{ + name: "linear.weight_scale", + dtype: "BF16", + shape: []uint64{2, 2}, + offset: 12, + size: 8, + }, + tensorBase: &tensorBase{ + name: "linear.weight", + shape: []uint64{3, 4}, + }, + } + + var b bytes.Buffer + if _, err := st.WriteTo(&b); err != nil { + t.Fatal(err) + } + + want := bfloat16.EncodeFloat32([]float32{ + 1, 1, 1, 2, + 1, 1, 1, 2, + 3, 3, 3, 4, + }) + if diff := cmp.Diff(want, b.Bytes()); diff != "" { + t.Errorf("safetensor.WriteTo() mismatch (-want +got):\n%s", diff) + } +} + +func TestParseSafetensorsConsumesFP8ScaleCompanion(t *testing.T) { + tempDir := t.TempDir() + generateSafetensorTestData(t, tempDir, map[string]*tensorData{ + "linear.weight": { + Offsets: []int{0, 4}, + Type: "F8_E4M3", + Shape: []int{2, 2}, + }, + "linear.weight_scale": { + Offsets: []int{4, 6}, + Type: "BF16", + Shape: []int{1, 1}, + }, + }) + writeFP8BlockConfig(t, tempDir, 128, 128) + + tensors, err := parseSafetensors(os.DirFS(tempDir), strings.NewReplacer(), "model-00001-of-00001.safetensors") + if err != nil { + t.Fatal(err) + } + if len(tensors) != 1 { + t.Fatalf("expected one tensor, got %d", len(tensors)) + } + if got := tensors[0].Name(); got != "linear.weight" { + t.Fatalf("unexpected tensor name %q", got) + } + if got := tensors[0].Kind(); got != tensorKindBF16 { + t.Fatalf("unexpected fp8 converted kind %d, want %d", got, tensorKindBF16) + } +} + +func TestParseSafetensorsRejectsFP8WithoutBlockMetadata(t *testing.T) { + tempDir := t.TempDir() + generateSafetensorTestData(t, tempDir, map[string]*tensorData{ + "linear.weight": { + Offsets: []int{0, 4}, + Type: "F8_E4M3", + Shape: []int{2, 2}, + }, + "linear.weight_scale": { + Offsets: []int{4, 6}, + Type: "BF16", + Shape: []int{1, 1}, + }, + }) + + _, err := parseSafetensors(os.DirFS(tempDir), strings.NewReplacer(), "model-00001-of-00001.safetensors") + if err == nil || !strings.Contains(err.Error(), "missing fp8 block size metadata") { + t.Fatalf("expected missing fp8 block size metadata error, got %v", err) + } +} + +func TestParseSafetensorsRejectsAmbiguousFP8ScaleCompanion(t *testing.T) { + tempDir := t.TempDir() + generateSafetensorTestData(t, tempDir, map[string]*tensorData{ + "linear.weight": { + Offsets: []int{0, 4}, + Type: "F8_E4M3", + Shape: []int{2, 2}, + }, + "linear.weight_scale": { + Offsets: []int{4, 6}, + Type: "BF16", + Shape: []int{1, 1}, + }, + "linear.weight.scale": { + Offsets: []int{6, 8}, + Type: "BF16", + Shape: []int{1, 1}, + }, + }) + writeFP8BlockConfig(t, tempDir, 128, 128) + + _, err := parseSafetensors(os.DirFS(tempDir), strings.NewReplacer(), "model-00001-of-00001.safetensors") + if err == nil || !strings.Contains(err.Error(), "multiple fp8 scale companions") { + t.Fatalf("expected ambiguous fp8 scale companion error, got %v", err) + } +} + +func writeFP8BlockConfig(t *testing.T, dir string, rows, cols int) { + t.Helper() + + config := fmt.Sprintf(`{ + "architectures": ["GenericForCausalLM"], + "compression_config": { + "format": "float-quantized", + "config_groups": { + "group_0": { + "format": "float-quantized", + "weights": { + "type": "float", + "num_bits": 8, + "block_structure": [%d, %d] + } + } + } + } +}`, rows, cols) + if err := os.WriteFile(filepath.Join(dir, "config.json"), []byte(config), 0o644); err != nil { + t.Fatal(err) + } +} + func TestSafetensorKind(t *testing.T) { tests := []struct { name string diff --git a/convert/tensor.go b/convert/tensor.go index 68870744f..5dad64c8d 100644 --- a/convert/tensor.go +++ b/convert/tensor.go @@ -5,6 +5,7 @@ import ( "errors" "io" "iter" + "maps" "path" "slices" "strconv" @@ -153,3 +154,54 @@ func (g mergeGroup) WriteTo(w io.Writer) (int64, error) { return 0, nil } + +func sourceTensorKV(ts []*ggml.Tensor) KV { + sourceFP8 := make(map[string]struct{}) + for _, t := range ts { + if writerSourceDType(t.WriterTo) == "F8_E4M3" { + sourceFP8[t.Name] = struct{}{} + } + } + if len(sourceFP8) == 0 { + return nil + } + + return KV{ + "source_quantization": "hf_fp8", + "source_fp8_tensors": slices.Sorted(maps.Keys(sourceFP8)), + } +} + +type sourceDTypeTensor interface { + SourceDType() string +} + +func writerSourceDType(w io.WriterTo) string { + switch w := w.(type) { + case sourceDTypeTensor: + return w.SourceDType() + case mergeGroup: + if len(w) == 0 { + return "" + } + dtype := sourceDType(w[0]) + if dtype == "" { + return "" + } + for _, t := range w[1:] { + if sourceDType(t) != dtype { + return "" + } + } + return dtype + default: + return "" + } +} + +func sourceDType(t Tensor) string { + if t, ok := t.(sourceDTypeTensor); ok { + return t.SourceDType() + } + return "" +} diff --git a/convert/tensor_test.go b/convert/tensor_test.go index e0dc2350a..1dea476f2 100644 --- a/convert/tensor_test.go +++ b/convert/tensor_test.go @@ -21,7 +21,8 @@ type fakeTensor struct { shape []uint64 data []float32 - repacker Repacker + sourceDType string + repacker Repacker } func (f fakeTensor) Name() string { @@ -36,16 +37,21 @@ func (f fakeTensor) Kind() uint32 { return 0 } +func (f fakeTensor) SourceDType() string { + return f.sourceDType +} + func (f *fakeTensor) SetRepacker(fn Repacker) { f.repacker = fn } func (f fakeTensor) Clone() Tensor { return &fakeTensor{ - name: f.name, - shape: slices.Clone(f.shape), - data: slices.Clone(f.data), - repacker: f.repacker, + name: f.name, + shape: slices.Clone(f.shape), + data: slices.Clone(f.data), + sourceDType: f.sourceDType, + repacker: f.repacker, } } @@ -995,3 +1001,43 @@ func TestMergeOrder(t *testing.T) { }) } } + +func TestSourceTensorKVRecordsFP8OutputTensors(t *testing.T) { + fp8 := &fakeTensor{name: "linear.weight", shape: []uint64{2, 2}, sourceDType: "F8_E4M3"} + bf16 := &fakeTensor{name: "other.weight", shape: []uint64{2, 2}, sourceDType: "BF16"} + + kv := sourceTensorKV([]*ggml.Tensor{ + {Name: "blk.0.linear.weight", WriterTo: fp8}, + {Name: "blk.0.other.weight", WriterTo: bf16}, + }) + + if got := kv["source_quantization"]; got != "hf_fp8" { + t.Fatalf("source_quantization = %v, want hf_fp8", got) + } + got, ok := kv["source_fp8_tensors"].([]string) + if !ok { + t.Fatalf("source_fp8_tensors = %#v, want []string", kv["source_fp8_tensors"]) + } + if diff := cmp.Diff([]string{"blk.0.linear.weight"}, got); diff != "" { + t.Fatalf("source_fp8_tensors mismatch (-want +got):\n%s", diff) + } +} + +func TestSourceTensorKVRecordsMergedFP8OutputTensors(t *testing.T) { + fp8A := &fakeTensor{name: "expert.0.weight", shape: []uint64{2, 2}, sourceDType: "F8_E4M3"} + fp8B := &fakeTensor{name: "expert.1.weight", shape: []uint64{2, 2}, sourceDType: "F8_E4M3"} + bf16 := &fakeTensor{name: "expert.2.weight", shape: []uint64{2, 2}, sourceDType: "BF16"} + + kv := sourceTensorKV([]*ggml.Tensor{ + {Name: "ffn_exps.weight", WriterTo: mergeGroup{fp8A, fp8B}}, + {Name: "mixed_exps.weight", WriterTo: mergeGroup{fp8A, bf16}}, + }) + + got, ok := kv["source_fp8_tensors"].([]string) + if !ok { + t.Fatalf("source_fp8_tensors = %#v, want []string", kv["source_fp8_tensors"]) + } + if diff := cmp.Diff([]string{"ffn_exps.weight"}, got); diff != "" { + t.Fatalf("source_fp8_tensors mismatch (-want +got):\n%s", diff) + } +} diff --git a/server/create.go b/server/create.go index 9ddb2bf8b..a322471c5 100644 --- a/server/create.go +++ b/server/create.go @@ -494,15 +494,18 @@ func createModel(r api.CreateRequest, name model.Name, baseLayers []*layerGGML, for _, layer := range baseLayers { if layer.GGML != nil { quantType := strings.ToUpper(cmp.Or(r.Quantize, r.Quantization)) + ft := layer.GGML.KV().FileType() + if quantType == "" && hasSourceFP8Tensors(layer.GGML.KV()) && layer.GGML.Name() == "gguf" && layer.MediaType == "application/vnd.ollama.image.model" && slices.Contains([]string{"F16", "BF16", "F32"}, ft.String()) { + quantType = "Q8_0" + } if quantType != "" && layer.GGML.Name() == "gguf" && layer.MediaType == "application/vnd.ollama.image.model" { want, err := ggml.ParseFileType(quantType) if err != nil { return err } - ft := layer.GGML.KV().FileType() - if !slices.Contains([]string{"F16", "F32"}, ft.String()) { - return errors.New("quantization is only supported for F16 and F32 models") + if !slices.Contains([]string{"F16", "BF16", "F32"}, ft.String()) { + return errors.New("quantization is only supported for F16, BF16 and F32 models") } else if ft != want { layer, err = quantizeLayer(layer, quantType, fn) if err != nil { @@ -606,6 +609,10 @@ func createModel(r api.CreateRequest, name model.Name, baseLayers []*layerGGML, return nil } +func hasSourceFP8Tensors(kv ggml.KV) bool { + return kv.String("source_quantization") == "hf_fp8" && len(kv.Strings("source_fp8_tensors")) > 0 +} + func quantizeLayer(layer *layerGGML, quantizeType string, fn func(resp api.ProgressResponse)) (*layerGGML, error) { ft := layer.GGML.KV().FileType() var doneBytes atomic.Uint64 diff --git a/server/quantization.go b/server/quantization.go index ee70b5fc0..0b3af9702 100644 --- a/server/quantization.go +++ b/server/quantization.go @@ -51,11 +51,14 @@ func (q quantizer) WriteTo(w io.Writer) (int64, error) { } type quantizeState struct { - nAttnV int // Number of attn_*v* weight tensors - nFfnDown int // Number of ffn_down tensors - iAttnV int // Running counter of number of attn_v tensors that have been processed - iFfnDown int // Running counter of number of ffn_down tensors that have been processed - hasOutput bool // used to figure out if a model shares tok_embd with the output weight + nAttnV int // Number of attn_*v* weight tensors + nFfnDown int // Number of ffn_down tensors + iAttnV int // Running counter of number of attn_v tensors that have been processed + iFfnDown int // Running counter of number of ffn_down tensors that have been processed + hasOutput bool // used to figure out if a model shares tok_embd with the output weight + preserveSourceFP8ToQ8 bool + preserveSourceQ4 bool + sourceFP8Tensors map[string]struct{} } func useMoreBits(iLayer, nLayers int) bool { @@ -120,10 +123,10 @@ func getTensorNewType(kv fsggml.KV, qs *quantizeState, newType fsggml.TensorType newType = fsggml.TensorTypeQ6_K } } else if strings.Contains(name, "attn_v.weight") { - if (ftype == fsggml.FileTypeQ4_K_M) && + if newType != fsggml.TensorTypeQ8_0 && (ftype == fsggml.FileTypeQ4_K_M) && useMoreBits(qs.iAttnV, qs.nAttnV) { newType = fsggml.TensorTypeQ6_K - } else if ftype == fsggml.FileTypeQ4_K_S && qs.iAttnV < 4 { + } else if newType != fsggml.TensorTypeQ8_0 && ftype == fsggml.FileTypeQ4_K_S && qs.iAttnV < 4 { newType = fsggml.TensorTypeQ5_K } @@ -164,21 +167,21 @@ func getTensorNewType(kv fsggml.KV, qs *quantizeState, newType fsggml.TensorType qs.iFfnDown++ } n_layer := qs.nFfnDown - if ftype == fsggml.FileTypeQ4_K_M { + if newType != fsggml.TensorTypeQ8_0 && ftype == fsggml.FileTypeQ4_K_M { if useMoreBits(iLayer, n_layer) { newType = fsggml.TensorTypeQ6_K } - } else if ftype == fsggml.FileTypeQ4_K_S && iLayer < n_layer/8 { + } else if newType != fsggml.TensorTypeQ8_0 && ftype == fsggml.FileTypeQ4_K_S && iLayer < n_layer/8 { newType = fsggml.TensorTypeQ5_K } } else if strings.Contains(name, "attn_output.weight") { - if nExperts == 8 { + if newType != fsggml.TensorTypeQ8_0 && nExperts == 8 { if ftype == fsggml.FileTypeQ4_K_S || ftype == fsggml.FileTypeQ4_K_M { newType = fsggml.TensorTypeQ5_K } } } else if strings.Contains(name, "attn_qkv.weight") { - if ftype == fsggml.FileTypeQ4_K_M { + if newType != fsggml.TensorTypeQ8_0 && ftype == fsggml.FileTypeQ4_K_M { newType = fsggml.TensorTypeQ5_K } } @@ -218,7 +221,12 @@ func quantize(in, out *os.File, orig *fsggml.GGML, newFileType fsggml.FileType, kv := maps.Clone(orig.KV()) kv["general.file_type"] = newFileType // kv["general.quantization_version"] = ggml.QuantizationVersion() - qs := &quantizeState{} + qs := &quantizeState{ + sourceFP8Tensors: sourceFP8TensorSet(kv), + } + hasSourceFP8 := hasSourceFP8Tensors(kv) + qs.preserveSourceFP8ToQ8 = hasSourceFP8 && newFileType == fsggml.FileTypeQ8_0 + qs.preserveSourceQ4 = hasSourceFP8 && slices.Contains([]fsggml.FileType{fsggml.FileTypeQ4_K_M, fsggml.FileTypeQ4_K_S}, newFileType) // Build up the quantize state so newType can adjust types layerCount := 0 for k, l := range orig.Tensors().GroupLayers() { @@ -304,6 +312,12 @@ func newType(t *fsggml.Tensor, kv fsggml.KV, qs *quantizeState, ftype fsggml.Fil newType := fsggml.TensorType(t.Kind) if quantize { + if qs.preserveSourceFP8ToQ8 { + if _, ok := qs.sourceFP8Tensors[name]; !ok { + return newType + } + } + if slices.Contains([]string{"qwen3next", "qwen35", "qwen35moe"}, kv.Architecture()) && (ftype == fsggml.FileTypeQ4_K_M || ftype == fsggml.FileTypeQ4_K_S) { if qt, ok := qwen3LinearAttnQuantType(name); ok { return qt @@ -311,6 +325,11 @@ func newType(t *fsggml.Tensor, kv fsggml.KV, qs *quantizeState, ftype fsggml.Fil } // get more optimal quantization type based on the tensor shape, layer, etc. + if qs.preserveSourceQ4 { + if _, ok := qs.sourceFP8Tensors[name]; !ok { + defaultType = fsggml.TensorTypeQ8_0 + } + } newType = getTensorNewType(kv, qs, defaultType, t.Name, t.Shape, ftype) if newType != defaultType { slog.Debug("tensor quantization adjusted for better quality", "name", t.Name, "requested", defaultType, "quantization", newType) @@ -318,3 +337,16 @@ func newType(t *fsggml.Tensor, kv fsggml.KV, qs *quantizeState, ftype fsggml.Fil } return newType } + +func sourceFP8TensorSet(kv fsggml.KV) map[string]struct{} { + names := kv.Strings("source_fp8_tensors") + if len(names) == 0 { + return nil + } + + out := make(map[string]struct{}, len(names)) + for _, name := range names { + out[name] = struct{}{} + } + return out +} diff --git a/server/quantization_test.go b/server/quantization_test.go index f8f10659c..49665c36f 100644 --- a/server/quantization_test.go +++ b/server/quantization_test.go @@ -308,6 +308,95 @@ func TestQuantizeModel(t *testing.T) { "output.weight": fsggml.TensorTypeQ8_0, }, }, + { + name: "source_fp8_q8_preserves_bf16_tensors", + kv: map[string]any{ + "general.architecture": "test", + "source_quantization": "hf_fp8", + "source_fp8_tensors": []string{"blk.1.ffn_down_exps.weight"}, + }, + tensors: []*fsggml.Tensor{ + { + Name: "blk.1.ffn_down_exps.weight", Kind: uint32(fsggml.TensorTypeBF16), + Offset: uint64(0), Shape: []uint64{256, 1}, + WriterTo: bytes.NewReader(quantBytes[fsggml.TensorTypeBF16]), + }, + { + Name: "blk.1.attn_q.weight", Kind: uint32(fsggml.TensorTypeBF16), + Offset: uint64(0), Shape: []uint64{256, 1}, + WriterTo: bytes.NewReader(quantBytes[fsggml.TensorTypeBF16]), + }, + }, + newType: "Q8_0", + expectedTensorTypes: map[string]fsggml.TensorType{ + "blk.1.ffn_down_exps.weight": fsggml.TensorTypeQ8_0, + "blk.1.attn_q.weight": fsggml.TensorTypeBF16, + }, + }, + { + name: "source_fp8_q4_promotes_bf16_tensors_to_q8", + kv: map[string]any{ + "general.architecture": "test", + "source_quantization": "hf_fp8", + "source_fp8_tensors": []string{ + "blk.1.ffn_gate_exps.weight", + "blk.1.ffn_down_exps.weight", + }, + }, + tensors: []*fsggml.Tensor{ + { + Name: "blk.1.ffn_gate_exps.weight", Kind: uint32(fsggml.TensorTypeBF16), + Offset: uint64(0), Shape: []uint64{256, 1}, + WriterTo: bytes.NewReader(quantBytes[fsggml.TensorTypeBF16]), + }, + { + Name: "blk.1.ffn_down_exps.weight", Kind: uint32(fsggml.TensorTypeBF16), + Offset: uint64(0), Shape: []uint64{256, 1}, + WriterTo: bytes.NewReader(quantBytes[fsggml.TensorTypeBF16]), + }, + { + Name: "blk.1.attn_q.weight", Kind: uint32(fsggml.TensorTypeBF16), + Offset: uint64(0), Shape: []uint64{256, 1}, + WriterTo: bytes.NewReader(quantBytes[fsggml.TensorTypeBF16]), + }, + { + Name: "blk.1.attn_v.weight", Kind: uint32(fsggml.TensorTypeBF16), + Offset: uint64(0), Shape: []uint64{256, 1}, + WriterTo: bytes.NewReader(quantBytes[fsggml.TensorTypeBF16]), + }, + { + Name: "blk.1.ffn_down.weight", Kind: uint32(fsggml.TensorTypeBF16), + Offset: uint64(0), Shape: []uint64{256, 1}, + WriterTo: bytes.NewReader(quantBytes[fsggml.TensorTypeBF16]), + }, + { + Name: "blk.1.attn_q_norm.weight", Kind: uint32(fsggml.TensorTypeBF16), + Offset: uint64(0), Shape: []uint64{256}, + WriterTo: bytes.NewReader(quantBytes[fsggml.TensorTypeBF16]), + }, + { + Name: "blk.1.ffn_gate_inp.weight", Kind: uint32(fsggml.TensorTypeBF16), + Offset: uint64(0), Shape: []uint64{256, 1}, + WriterTo: bytes.NewReader(quantBytes[fsggml.TensorTypeBF16]), + }, + { + Name: "output.weight", Kind: uint32(fsggml.TensorTypeBF16), + Offset: uint64(0), Shape: []uint64{256, 1}, + WriterTo: bytes.NewReader(quantBytes[fsggml.TensorTypeBF16]), + }, + }, + newType: "Q4_K_M", + expectedTensorTypes: map[string]fsggml.TensorType{ + "blk.1.ffn_gate_exps.weight": fsggml.TensorTypeQ4_K, + "blk.1.ffn_down_exps.weight": fsggml.TensorTypeQ6_K, + "blk.1.attn_q.weight": fsggml.TensorTypeQ8_0, + "blk.1.attn_v.weight": fsggml.TensorTypeQ8_0, + "blk.1.ffn_down.weight": fsggml.TensorTypeQ8_0, + "blk.1.attn_q_norm.weight": fsggml.TensorTypeBF16, + "blk.1.ffn_gate_inp.weight": fsggml.TensorTypeBF16, + "output.weight": fsggml.TensorTypeQ8_0, + }, + }, { name: "f32_short_data", kv: map[string]any{