llama/compat: disable mmap when load_op transforms text-side tensors

Root-cause fix for the glm-ocr GGML_ASSERT(buft) crash: handle_glmocr
fuses ffn_gate + ffn_up into a single ffn_up tensor with ne[1] doubled.
The reshape grows ggml_nbytes past what the file's mmap region for the
original tensor can back, and the upstream loader's mmap path tries to
bind the tensor's storage directly to that (too-small) region. With our
maybe_load_text_tensor hook attempting to fill the tensor before that
binding, cur->buffer is still null and ggml_backend_buft_is_host(nullptr)
asserts.

The previous defensive null-buft check (dbba9b170) only papered over the
crash — it would silently return false and let upstream proceed with the
wrong mmap-backed binding, producing garbage output instead.

Real fix: have handlers that transform text-side tensor bytes call
`disable_mmap_for(ml)`. translate_metadata then returns true, and the
patch site sets `use_mmap = false`. The non-mmap path pre-allocates real
backend buffers via ggml_backend_alloc_ctx_tensors_from_buft, after
which our load_op overrides land in writable memory.

Currently only handle_glmocr needs this; the per-block FFN concat is the
sole text-side reshape with a load_op. Other handlers (gemma3, gemma4,
qwen35, etc.) only do KV translation or rename-without-resize and remain
mmap-compatible.

Patch unchanged in line count (78) — the existing translate_metadata
call site is rewritten to consume the new bool return.

Verification blocked: every prior llama-server process on this box is
wedged in macOS uninterruptible-Metal-wait (UE) and survives kill -9,
preventing new processes from initializing past Metal discovery. Fix
will be live-verified after reboot.
This commit is contained in:
jmorganca 2026-04-19 22:18:23 -07:00
parent cc7bdf0bcc
commit 0c33775d37
5 changed files with 45 additions and 4 deletions

View file

@@ -7,6 +7,7 @@
#include <cstring>
#include <mutex>
#include <unordered_map>
#include <unordered_set>
namespace llama_ollama_compat::detail {
@@ -230,6 +231,21 @@ bool should_skip_tensor_prefix(const llama_model_loader * ml, const char * name)
return false;
}
namespace {
std::mutex g_no_mmap_mutex;
std::unordered_set<const llama_model_loader *> g_no_mmap;
} // anon
void disable_mmap_for(const llama_model_loader * ml) {
std::lock_guard<std::mutex> lk(g_no_mmap_mutex);
g_no_mmap.insert(ml);
}
bool is_mmap_disabled_for(const llama_model_loader * ml) {
std::lock_guard<std::mutex> lk(g_no_mmap_mutex);
return g_no_mmap.count(ml) > 0;
}
// -------------------------------------------------------------------------
// Load-time transform registry
// -------------------------------------------------------------------------

View file

@@ -93,6 +93,15 @@ size_t tensor_file_offset(const gguf_context * meta, const char * name);
void add_skip_prefix(const llama_model_loader * ml, std::string prefix);
bool should_skip_tensor_prefix(const llama_model_loader * ml, const char * name);
// -- Per-loader "needs no-mmap" flag --
// Handlers that register a load_op which transforms a TEXT-side tensor's
// bytes (e.g. concat reshape) must call disable_mmap_for(ml). With mmap
// the upstream loader binds the tensor directly to the file region, so
// our load_op has no writable buffer to fill. translate_metadata reads
// this flag and returns it back to the patch site.
void disable_mmap_for(const llama_model_loader * ml);
bool is_mmap_disabled_for(const llama_model_loader * ml);
// -- Load-time transform registry --
struct LoadOp {
std::function<bool(const char * src_file, void * dst, size_t dst_size)> apply;

View file

@@ -610,6 +610,12 @@ void handle_glmocr(const llama_model_loader * ml, gguf_context * meta,
// Fuse ffn_gate + ffn_up → ffn_up[:, 2*n_ff] for every block, then mark
// the orphan ffn_gate tensors as skip so n_tensors lines up.
//
// The concat reshape grows ne[1] of ffn_up from N to 2N, so the file's
// mmap region for the original tensor is too small to back it. Force
// the loader off the mmap path so it pre-allocates real backend buffers
// that our register_concat_load can fill at load_all_data time.
disable_mmap_for(ml);
{
const int64_t n_blk_kid = gguf_find_key(meta, "glm4.block_count");
const uint32_t n_blocks = n_blk_kid >= 0 ? gguf_get_val_u32(meta, n_blk_kid) : 16;
@@ -1874,12 +1880,12 @@ void handle_qwen3vl_clip(gguf_context * meta, ggml_context * ctx) {
// Public entry points
// =========================================================================
void translate_metadata(const llama_model_loader * ml,
bool translate_metadata(const llama_model_loader * ml,
gguf_context * meta,
ggml_context * ctx,
std::string & arch_name,
const char * fname) {
if (!meta) return;
if (!meta) return false;
{
std::lock_guard<std::mutex> lk(g_loader_path_mutex);
g_loader_paths[ml] = fname ? fname : "";
@@ -1907,6 +1913,8 @@ void translate_metadata(const llama_model_loader * ml,
if (arch_name == "llama4") handle_llama4 (ml, meta, ctx);
if (arch_name == "glmocr") handle_glmocr (ml, meta, ctx, arch_name);
// Dispatch. Add more arches as they are wired up.
return is_mmap_disabled_for(ml);
}
void translate_clip_metadata(gguf_context * meta, ggml_context * ctx) {

View file

@@ -37,7 +37,15 @@ namespace llama_ollama_compat {
// Called from llama_model_loader's constructor, right after the arch is read.
// `fname` is the model file path, captured here so later load-time hooks
// (maybe_load_text_tensor) can read raw bytes from it.
void translate_metadata(const llama_model_loader * ml,
//
// Returns true if the caller must disable mmap for this loader. Some
// handlers transform tensor data via load_op (e.g. glm-ocr's gate+up
// FFN concat), which is incompatible with the default mmap path:
// the upstream loader binds tensors directly to the mmap'd file region,
// so there's nowhere to write the transformed bytes. Disabling mmap
// makes the loader pre-allocate real backend buffers, after which our
// load_op overrides land in writable memory.
bool translate_metadata(const llama_model_loader * ml,
gguf_context * meta,
ggml_context * ctx,
std::string & arch_name,

View file

@@ -14,7 +14,7 @@ index 4e65a45a5..75836c683 100644
}
get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
+ llama_ollama_compat::translate_metadata(this, metadata, ctx, arch_name, fname.c_str());
+ if (llama_ollama_compat::translate_metadata(this, metadata, ctx, arch_name, fname.c_str())) use_mmap = false;
llm_kv = LLM_KV(llm_arch_from_string(arch_name));
files.emplace_back(new llama_file(fname.c_str(), "rb", use_direct_io));