mirror of
https://github.com/ollama/ollama
synced 2026-04-23 08:45:14 +00:00
llama/compat: disable mmap when load_op transforms text-side tensors
Root-cause fix for the glm-ocr GGML_ASSERT(buft) crash: handle_glmocr fuses ffn_gate + ffn_up into a single ffn_up tensor with ne[1] doubled. The reshape grows ggml_nbytes past what the file's mmap region for the original tensor can back, and the upstream loader's mmap path tries to bind the tensor's storage directly to that (too-small) region. With our maybe_load_text_tensor hook attempting to fill the tensor before that binding, cur->buffer is still null and ggml_backend_buft_is_host(nullptr) asserts. The previous defensive null-buft check (dbba9b170) only papered over the crash — it would silently return false and let upstream proceed with the wrong mmap-backed binding, producing garbage output instead. Real fix: have handlers that transform text-side tensor bytes call `disable_mmap_for(ml)`. translate_metadata then returns true, and the patch site sets `use_mmap = false`. The non-mmap path pre-allocates real backend buffers via ggml_backend_alloc_ctx_tensors_from_buft, after which our load_op overrides land in writable memory. Currently only handle_glmocr needs this; the per-block FFN concat is the sole text-side reshape with a load_op. Other handlers (gemma3, gemma4, qwen35, etc.) only do KV translation or rename-without-resize and remain mmap-compatible. Patch unchanged in line count (78) — the existing translate_metadata call site is rewritten to consume the new bool return. Verification blocked: every prior llama-server process on this box is wedged in macOS uninterruptible-Metal-wait (UE) and survives kill -9, preventing new processes from initializing past Metal discovery. Fix will be live-verified after reboot.
This commit is contained in:
parent
cc7bdf0bcc
commit
0c33775d37
16
llama/compat/llama-ollama-compat-util.cpp
vendored
16
llama/compat/llama-ollama-compat-util.cpp
vendored
|
|
@ -7,6 +7,7 @@
|
|||
#include <cstring>
|
||||
#include <mutex>
|
||||
#include <unordered_map>
|
||||
#include <unordered_set>
|
||||
|
||||
namespace llama_ollama_compat::detail {
|
||||
|
||||
|
|
@ -230,6 +231,21 @@ bool should_skip_tensor_prefix(const llama_model_loader * ml, const char * name)
|
|||
return false;
|
||||
}
|
||||
|
||||
// File-local state for the per-loader "needs no-mmap" flag.
namespace {
|
||||
// Guards every access to g_no_mmap; both accessors below take it before
// touching the set.
std::mutex g_no_mmap_mutex;
|
||||
// Loaders that have been flagged via disable_mmap_for(). Keyed by loader
// address only; the pointers are never dereferenced here.
std::unordered_set<const llama_model_loader *> g_no_mmap;
|
||||
} // anon
|
||||
|
||||
// Marks `ml` as requiring the non-mmap load path by recording its address
// in g_no_mmap. Flagging the same loader more than once has no additional
// effect (set insertion).
// NOTE(review): no matching erase is visible in this hunk, so an entry
// presumably outlives its loader; if a later loader is allocated at the
// same address it would appear flagged too -- confirm whether a cleanup
// hook exists elsewhere.
void disable_mmap_for(const llama_model_loader * ml) {
|
||||
// Hold the registry mutex for the duration of the insert.
std::lock_guard<std::mutex> lk(g_no_mmap_mutex);
|
||||
g_no_mmap.insert(ml);
|
||||
}
|
||||
|
||||
// Returns true if `ml` has previously been passed to disable_mmap_for(),
// false otherwise. The lookup compares loader addresses only.
bool is_mmap_disabled_for(const llama_model_loader * ml) {
|
||||
// Hold the registry mutex so the lookup does not race with an insert.
std::lock_guard<std::mutex> lk(g_no_mmap_mutex);
|
||||
return g_no_mmap.count(ml) > 0;
|
||||
}
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
// Load-time transform registry
|
||||
// -------------------------------------------------------------------------
|
||||
|
|
|
|||
9
llama/compat/llama-ollama-compat-util.h
vendored
9
llama/compat/llama-ollama-compat-util.h
vendored
|
|
@ -93,6 +93,15 @@ size_t tensor_file_offset(const gguf_context * meta, const char * name);
|
|||
void add_skip_prefix(const llama_model_loader * ml, std::string prefix);
|
||||
bool should_skip_tensor_prefix(const llama_model_loader * ml, const char * name);
|
||||
|
||||
// -- Per-loader "needs no-mmap" flag --
|
||||
// Handlers that register a load_op which transforms a TEXT-side tensor's
|
||||
// bytes (e.g. concat reshape) must call disable_mmap_for(ml). With mmap
|
||||
// the upstream loader binds the tensor directly to the file region, so
|
||||
// our load_op has no writable buffer to fill. translate_metadata reads
|
||||
// this flag and returns it back to the patch site.
|
||||
void disable_mmap_for(const llama_model_loader * ml);
|
||||
bool is_mmap_disabled_for(const llama_model_loader * ml);
|
||||
|
||||
// -- Load-time transform registry --
|
||||
struct LoadOp {
|
||||
std::function<bool(const char * src_file, void * dst, size_t dst_size)> apply;
|
||||
|
|
|
|||
12
llama/compat/llama-ollama-compat.cpp
vendored
12
llama/compat/llama-ollama-compat.cpp
vendored
|
|
@ -610,6 +610,12 @@ void handle_glmocr(const llama_model_loader * ml, gguf_context * meta,
|
|||
|
||||
// Fuse ffn_gate + ffn_up → ffn_up[:, 2*n_ff] for every block, then mark
|
||||
// the orphan ffn_gate tensors as skip so n_tensors lines up.
|
||||
//
|
||||
// The concat reshape grows ne[1] of ffn_up from N to 2N, so the file's
|
||||
// mmap region for the original tensor is too small to back it. Force
|
||||
// the loader off the mmap path so it pre-allocates real backend buffers
|
||||
// that our register_concat_load can fill at load_all_data time.
|
||||
disable_mmap_for(ml);
|
||||
{
|
||||
const int64_t n_blk_kid = gguf_find_key(meta, "glm4.block_count");
|
||||
const uint32_t n_blocks = n_blk_kid >= 0 ? gguf_get_val_u32(meta, n_blk_kid) : 16;
|
||||
|
|
@ -1874,12 +1880,12 @@ void handle_qwen3vl_clip(gguf_context * meta, ggml_context * ctx) {
|
|||
// Public entry points
|
||||
// =========================================================================
|
||||
|
||||
void translate_metadata(const llama_model_loader * ml,
|
||||
bool translate_metadata(const llama_model_loader * ml,
|
||||
gguf_context * meta,
|
||||
ggml_context * ctx,
|
||||
std::string & arch_name,
|
||||
const char * fname) {
|
||||
if (!meta) return;
|
||||
if (!meta) return false;
|
||||
{
|
||||
std::lock_guard<std::mutex> lk(g_loader_path_mutex);
|
||||
g_loader_paths[ml] = fname ? fname : "";
|
||||
|
|
@ -1907,6 +1913,8 @@ void translate_metadata(const llama_model_loader * ml,
|
|||
if (arch_name == "llama4") handle_llama4 (ml, meta, ctx);
|
||||
if (arch_name == "glmocr") handle_glmocr (ml, meta, ctx, arch_name);
|
||||
// Dispatch. Add more arches as they are wired up.
|
||||
|
||||
return is_mmap_disabled_for(ml);
|
||||
}
|
||||
|
||||
void translate_clip_metadata(gguf_context * meta, ggml_context * ctx) {
|
||||
|
|
|
|||
10
llama/compat/llama-ollama-compat.h
vendored
10
llama/compat/llama-ollama-compat.h
vendored
|
|
@ -37,7 +37,15 @@ namespace llama_ollama_compat {
|
|||
// Called from llama_model_loader's constructor, right after the arch is read.
|
||||
// `fname` is the model file path, captured here so later load-time hooks
|
||||
// (maybe_load_text_tensor) can read raw bytes from it.
|
||||
void translate_metadata(const llama_model_loader * ml,
|
||||
//
|
||||
// Returns true if the caller must disable mmap for this loader. Some
|
||||
// handlers transform tensor data via load_op (e.g. glm-ocr's gate+up
|
||||
// FFN concat), which is incompatible with the default mmap path:
|
||||
// the upstream loader binds tensors directly to the mmap'd file region,
|
||||
// so there's nowhere to write the transformed bytes. Disabling mmap
|
||||
// makes the loader pre-allocate real backend buffers, after which our
|
||||
// load_op overrides land in writable memory.
|
||||
bool translate_metadata(const llama_model_loader * ml,
|
||||
gguf_context * meta,
|
||||
ggml_context * ctx,
|
||||
std::string & arch_name,
|
||||
|
|
|
|||
|
|
@ -14,7 +14,7 @@ index 4e65a45a5..75836c683 100644
|
|||
}
|
||||
|
||||
get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
|
||||
+ llama_ollama_compat::translate_metadata(this, metadata, ctx, arch_name, fname.c_str());
|
||||
+ if (llama_ollama_compat::translate_metadata(this, metadata, ctx, arch_name, fname.c_str())) use_mmap = false;
|
||||
llm_kv = LLM_KV(llm_arch_from_string(arch_name));
|
||||
|
||||
files.emplace_back(new llama_file(fname.c_str(), "rb", use_direct_io));
|
||||
|
|
|
|||
Loading…
Reference in a new issue