llama/compat: disable mmap when load_op transforms text-side tensors

Root-cause fix for the glm-ocr GGML_ASSERT(buft) crash: handle_glmocr
fuses ffn_gate + ffn_up into a single ffn_up tensor with ne[1] doubled.
The reshape grows ggml_nbytes past what the file's mmap region for the
original tensor can back, and the upstream loader's mmap path tries to
bind the tensor's storage directly to that (too-small) region. With our
maybe_load_text_tensor hook attempting to fill the tensor before that
binding, cur->buffer is still null and ggml_backend_buft_is_host(nullptr)
asserts.

The previous defensive null-buft check (dbba9b170) only papered over the
crash — it would silently return false and let upstream proceed with the
wrong mmap-backed binding, producing garbage output instead.

Real fix: have handlers that transform text-side tensor bytes call
`disable_mmap_for(ml)`. translate_metadata then returns true, and the
patch site sets `use_mmap = false`. The non-mmap path pre-allocates real
backend buffers via ggml_backend_alloc_ctx_tensors_from_buft, after
which our load_op overrides land in writable memory.

Currently only handle_glmocr needs this; the per-block FFN concat is the
sole text-side reshape with a load_op. Other handlers (gemma3, gemma4,
qwen35, etc.) only do KV translation or rename-without-resize and remain
mmap-compatible.

Patch unchanged in line count (78) — the existing translate_metadata
call site is rewritten to consume the new bool return.

Verification blocked: every prior llama-server process on this box is
wedged in macOS uninterruptible-Metal-wait (UE) and survives kill -9,
preventing new processes from initializing past Metal discovery. Fix
will be live-verified after reboot.
This commit is contained in:
jmorganca 2026-04-19 22:18:23 -07:00
parent cc7bdf0bcc
commit 0c33775d37
5 changed files with 45 additions and 4 deletions

View file

@@ -7,6 +7,7 @@
#include <cstring>
#include <mutex>
#include <unordered_map>
#include <unordered_set>
namespace llama_ollama_compat::detail {
@@ -230,6 +231,21 @@ bool should_skip_tensor_prefix(const llama_model_loader * ml, const char * name)
return false;
}
namespace {
std::mutex g_no_mmap_mutex;
std::unordered_set<const llama_model_loader *> g_no_mmap;
} // anon
void disable_mmap_for(const llama_model_loader * ml) {
std::lock_guard<std::mutex> lk(g_no_mmap_mutex);
g_no_mmap.insert(ml);
}
bool is_mmap_disabled_for(const llama_model_loader * ml) {
std::lock_guard<std::mutex> lk(g_no_mmap_mutex);
return g_no_mmap.count(ml) > 0;
}
// -------------------------------------------------------------------------
// Load-time transform registry
// -------------------------------------------------------------------------

View file

@@ -93,6 +93,15 @@ size_t tensor_file_offset(const gguf_context * meta, const char * name);
void add_skip_prefix(const llama_model_loader * ml, std::string prefix);
bool should_skip_tensor_prefix(const llama_model_loader * ml, const char * name);
// -- Per-loader "needs no-mmap" flag --
// Handlers that register a load_op which transforms a TEXT-side tensor's
// bytes (e.g. concat reshape) must call disable_mmap_for(ml). With mmap
// the upstream loader binds the tensor directly to the file region, so
// our load_op has no writable buffer to fill. translate_metadata reads
// this flag and returns it back to the patch site.
void disable_mmap_for(const llama_model_loader * ml);
bool is_mmap_disabled_for(const llama_model_loader * ml);
// -- Load-time transform registry --
struct LoadOp {
std::function<bool(const char * src_file, void * dst, size_t dst_size)> apply;

View file

@@ -610,6 +610,12 @@ void handle_glmocr(const llama_model_loader * ml, gguf_context * meta,
// Fuse ffn_gate + ffn_up → ffn_up[:, 2*n_ff] for every block, then mark
// the orphan ffn_gate tensors as skip so n_tensors lines up.
//
// The concat reshape grows ne[1] of ffn_up from N to 2N, so the file's
// mmap region for the original tensor is too small to back it. Force
// the loader off the mmap path so it pre-allocates real backend buffers
// that our register_concat_load can fill at load_all_data time.
disable_mmap_for(ml);
{
const int64_t n_blk_kid = gguf_find_key(meta, "glm4.block_count");
const uint32_t n_blocks = n_blk_kid >= 0 ? gguf_get_val_u32(meta, n_blk_kid) : 16;
@@ -1874,12 +1880,12 @@ void handle_qwen3vl_clip(gguf_context * meta, ggml_context * ctx) {
// Public entry points
// =========================================================================
void translate_metadata(const llama_model_loader * ml,
bool translate_metadata(const llama_model_loader * ml,
gguf_context * meta,
ggml_context * ctx,
std::string & arch_name,
const char * fname) {
if (!meta) return;
if (!meta) return false;
{
std::lock_guard<std::mutex> lk(g_loader_path_mutex);
g_loader_paths[ml] = fname ? fname : "";
@@ -1907,6 +1913,8 @@ void translate_metadata(const llama_model_loader * ml,
if (arch_name == "llama4") handle_llama4 (ml, meta, ctx);
if (arch_name == "glmocr") handle_glmocr (ml, meta, ctx, arch_name);
// Dispatch. Add more arches as they are wired up.
return is_mmap_disabled_for(ml);
}
void translate_clip_metadata(gguf_context * meta, ggml_context * ctx) {

View file

@@ -37,7 +37,15 @@ namespace llama_ollama_compat {
// Called from llama_model_loader's constructor, right after the arch is read.
// `fname` is the model file path, captured here so later load-time hooks
// (maybe_load_text_tensor) can read raw bytes from it.
void translate_metadata(const llama_model_loader * ml,
//
// Returns true if the caller must disable mmap for this loader. Some
// handlers transform tensor data via load_op (e.g. glm-ocr's gate+up
// FFN concat), which is incompatible with the default mmap path:
// the upstream loader binds tensors directly to the mmap'd file region,
// so there's nowhere to write the transformed bytes. Disabling mmap
// makes the loader pre-allocate real backend buffers, after which our
// load_op overrides land in writable memory.
bool translate_metadata(const llama_model_loader * ml,
gguf_context * meta,
ggml_context * ctx,
std::string & arch_name,

View file

@@ -14,7 +14,7 @@ index 4e65a45a5..75836c683 100644
}
get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
+ llama_ollama_compat::translate_metadata(this, metadata, ctx, arch_name, fname.c_str());
+ if (llama_ollama_compat::translate_metadata(this, metadata, ctx, arch_name, fname.c_str())) use_mmap = false;
llm_kv = LLM_KV(llm_arch_from_string(arch_name));
files.emplace_back(new llama_file(fname.c_str(), "rb", use_direct_io));