From 227e1396deeaa44336475d1af0a0753263aba454 Mon Sep 17 00:00:00 2001
From: Matt Clayton <matt@lmstudio.ai>
Date: Wed, 7 May 2025 17:54:00 -0400
Subject: [PATCH 1/3] mtmd: Expose helper_decode_image, output_embd_copy,
 image_tokens_copy/free

---
 tools/mtmd/mtmd.cpp | 157 ++++++++++++++++++++++++++++----------------
 tools/mtmd/mtmd.h   |  30 +++++++--
 2 files changed, 124 insertions(+), 63 deletions(-)

diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
index b600e4341375f..41d87740949f0 100644
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@@ -167,7 +167,7 @@ struct mtmd_image_tokens {
     clip_image_f32_batch batch_f32; // preprocessed image patches
     std::string id; // optional user-defined ID, useful for KV cache tracking
 
-    mtmd_image_tokens clone() {
+    mtmd_image_tokens clone() const {
         return mtmd_image_tokens{
             nx,
             ny,
@@ -409,12 +409,6 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
     return 0;
 }
 
-static void mtmd_image_tokens_free(mtmd_image_tokens * image_tokens) {
-    if (image_tokens) {
-        delete image_tokens;
-    }
-}
-
 int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) {
     int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip);
     ctx->image_embd_v.resize(image_tokens->n_tokens() * n_mmproj_embd);
@@ -454,6 +448,23 @@ float * mtmd_get_output_embd(mtmd_context * ctx) {
     return ctx->image_embd_v.data();
 }
 
+float * mtmd_get_output_embd_copy(mtmd_context * ctx, size_t * n_embd_out) {
+    if (ctx->image_embd_v.empty()) {
+        *n_embd_out = 0;
+        return NULL;
+    }
+
+    *n_embd_out = ctx->image_embd_v.size();
+    float * copy = (float *) malloc(*n_embd_out * sizeof(float));
+    if (copy == NULL) {
+        *n_embd_out = 0;
+        return NULL;
+    }
+
+    memcpy(copy, ctx->image_embd_v.data(), ctx->image_embd_v.size() * sizeof(float));
+    return copy;
+}
+
 size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks) {
     size_t n_tokens = 0;
     for (size_t i = 0; i < mtmd_input_chunks_size(chunks); i++) {
@@ -580,6 +591,69 @@ struct decode_embd_batch {
     }
 };
 
+// Helper function for decoding an image whose embeddings have already been calculated
+int32_t mtmd_helper_decode_image(
+        mtmd_context * ctx,
+        struct llama_context * lctx,
+        const mtmd_image_tokens * image_tokens,
+        float * embd,
+        llama_pos n_past,
+        llama_seq_id seq_id,
+        int32_t n_batch,
+        llama_pos * new_n_past) {
+    int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip);
+    int n_pos_per_embd = mtmd_decode_use_mrope(ctx) ? 4 : 1;
+
+    int32_t n_tokens = mtmd_image_tokens_get_n_tokens(image_tokens);
+    int32_t i_batch = 0;
+    int32_t n_img_batches = GGML_PAD(n_tokens, n_batch) / n_batch;
+    decode_embd_batch batch_embd(embd, n_tokens, n_pos_per_embd, n_mmproj_embd);
+
+    const int nx = mtmd_image_tokens_get_nx(image_tokens);
+    const int ny = mtmd_image_tokens_get_ny(image_tokens);
+
+    if (mtmd_decode_use_mrope(ctx)) {
+        batch_embd.set_position_mrope(n_past, nx, ny, seq_id);
+    } else {
+        batch_embd.set_position_normal(n_past, seq_id);
+    }
+
+    if (mtmd_decode_use_non_causal(ctx)) {
+        llama_set_causal_attn(lctx, false);
+        // TODO @ngxson : need to make sure only one image is processed at a time, and n_ubatch must be enough to hold the image
+    }
+
+    while (i_batch < n_img_batches) { // split into batches
+        int pos_offset = i_batch*n_batch;
+        int n_tokens_batch = std::min(n_batch, n_tokens - pos_offset);
+        llama_batch batch_embd_view = batch_embd.get_view(pos_offset, n_tokens_batch);
+
+        LOG_INF("decoding image batch %d/%d, n_tokens_batch = %d\n", i_batch+1, n_img_batches, n_tokens_batch);
+
+        int64_t t1 = ggml_time_ms();
+        int32_t ret = llama_decode(lctx, batch_embd_view);
+        if (ret != 0) {
+            LOG_ERR("failed to decode image\n");
+            llama_set_causal_attn(lctx, true); // restore causal attn
+            return ret;
+        }
+
+        if (ctx->print_timings) {
+            LOG_INF("image decoded (batch %d/%d) in %" PRId64 " ms\n", i_batch+1, n_img_batches, ggml_time_ms() - t1);
+        }
+
+        i_batch++;
+    }
+
+    n_past += mtmd_image_tokens_get_n_pos(image_tokens);
+    *new_n_past = n_past;
+
+    if (mtmd_decode_use_non_causal(ctx)) {
+        llama_set_causal_attn(lctx, true);
+    }
+    return 0;
+}
+
 int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx,
         struct llama_context * lctx,
         const mtmd_input_chunk * chunk,
@@ -591,8 +665,6 @@ int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx,
     int32_t ret;
     llama_batch text_batch = llama_batch_init(n_batch, 0, 1);
     auto chunk_type = mtmd_input_chunk_get_type(chunk);
-    int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip);
-    int n_pos_per_embd = mtmd_decode_use_mrope(ctx) ? 4 : 1;
 
     if (chunk_type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
         size_t n_tokens;
@@ -637,57 +709,13 @@ int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx,
         if (ctx->print_timings) {
             LOG_INF("image/slice encoded in %" PRId64 " ms\n", ggml_time_ms() - t0);
         }
-
-        int32_t n_tokens = mtmd_image_tokens_get_n_tokens(image_tokens);
-        int32_t i_batch = 0;
-        int32_t n_img_batches = GGML_PAD(n_tokens, n_batch) / n_batch;
         float * embd = mtmd_get_output_embd(ctx);
-        decode_embd_batch batch_embd(embd, n_tokens, n_pos_per_embd, n_mmproj_embd);
-
-        const int nx = mtmd_image_tokens_get_nx(image_tokens);
-        const int ny = mtmd_image_tokens_get_ny(image_tokens);
-
-        if (mtmd_decode_use_mrope(ctx)) {
-            batch_embd.set_position_mrope(n_past, nx, ny, seq_id);
-        } else {
-            batch_embd.set_position_normal(n_past, seq_id);
-        }
-
-        if (mtmd_decode_use_non_causal(ctx)) {
-            llama_set_causal_attn(lctx, false);
-            // TODO @ngxson : need to make sure only one image is processed at a time, and n_ubatch must be enough to hold the image
-        }
-
-        while (i_batch < n_img_batches) { // split into batches
-            int pos_offset = i_batch*n_batch;
-            int n_tokens_batch = std::min(n_batch, n_tokens - pos_offset);
-            llama_batch batch_embd_view = batch_embd.get_view(pos_offset, n_tokens_batch);
-
-            LOG_INF("decoding image batch %d/%d, n_tokens_batch = %d\n", i_batch+1, n_img_batches, n_tokens_batch);
-
-            int64_t t1 = ggml_time_ms();
-            ret = llama_decode(lctx, batch_embd_view);
-            if (ret != 0) {
-                LOG_ERR("failed to decode image\n");
-                llama_set_causal_attn(lctx, true); // restore causal attn
-                llama_batch_free(text_batch);
-                return ret;
-            }
-
-            if (ctx->print_timings) {
-                LOG_INF("image decoded (batch %d/%d) in %" PRId64 " ms\n", i_batch+1, n_img_batches, ggml_time_ms() - t1);
-            }
-
-            i_batch++;
-        }
-
-        n_past += mtmd_image_tokens_get_n_pos(image_tokens);
-        *new_n_past = n_past;
-
-        if (mtmd_decode_use_non_causal(ctx)) {
-            llama_set_causal_attn(lctx, true);
+        ret = mtmd_helper_decode_image(ctx, lctx, image_tokens, embd, n_past, seq_id, n_batch, new_n_past);
+        if (ret != 0) {
+            LOG_ERR("failed to decode image\n");
+            llama_batch_free(text_batch);
+            return ret;
         }
-
     } else {
         GGML_ABORT("chunk type not supported");
     }
@@ -903,6 +931,19 @@ llama_pos mtmd_image_tokens_get_n_pos(const mtmd_image_tokens * image_tokens) {
     return image_tokens->n_tokens();
 }
 
+void mtmd_image_tokens_free(mtmd_image_tokens * image_tokens) {
+    if (image_tokens) {
+        delete image_tokens;
+    }
+}
+
+mtmd_image_tokens * mtmd_image_tokens_copy(const mtmd_image_tokens * image_tokens) {
+    if (!image_tokens) {
+        return nullptr;
+    }
+    return new mtmd_image_tokens(image_tokens->clone());
+}
+
 // test function
 
 mtmd_input_chunks * mtmd_test_create_input_chunks() {
diff --git a/tools/mtmd/mtmd.h b/tools/mtmd/mtmd.h
index e2f76e2e8d346..eb76db8e26d11 100644
--- a/tools/mtmd/mtmd.h
+++ b/tools/mtmd/mtmd.h
@@ -143,12 +143,14 @@ MTMD_API void               mtmd_input_chunk_free(mtmd_input_chunk * chunk);
 //
 // the instance will be constructed via mtmd_tokenize()
 // it will be freed along with mtmd_input_chunk
-MTMD_API size_t       mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens);
-MTMD_API size_t       mtmd_image_tokens_get_nx      (const mtmd_image_tokens * image_tokens);
-MTMD_API size_t       mtmd_image_tokens_get_ny      (const mtmd_image_tokens * image_tokens);
-MTMD_API const char * mtmd_image_tokens_get_id      (const mtmd_image_tokens * image_tokens);
+MTMD_API size_t              mtmd_image_tokens_get_n_tokens (const mtmd_image_tokens * image_tokens);
+MTMD_API size_t              mtmd_image_tokens_get_nx       (const mtmd_image_tokens * image_tokens);
+MTMD_API size_t              mtmd_image_tokens_get_ny       (const mtmd_image_tokens * image_tokens);
+MTMD_API const char *        mtmd_image_tokens_get_id       (const mtmd_image_tokens * image_tokens);
 // number of temporal positions (always 1 for M-RoPE, n_tokens otherwise)
-MTMD_API llama_pos    mtmd_image_tokens_get_n_pos   (const mtmd_image_tokens * image_tokens);
+MTMD_API llama_pos           mtmd_image_tokens_get_n_pos    (const mtmd_image_tokens * image_tokens);
+MTMD_API mtmd_image_tokens * mtmd_image_tokens_copy         (const mtmd_image_tokens * image_tokens);
+MTMD_API void                mtmd_image_tokens_free         (mtmd_image_tokens * image_tokens);
 
 // tokenize an input text prompt and an image
 // the prompt must have the input image marker (default: "<__image__>") in it
@@ -178,6 +180,9 @@ MTMD_API int32_t mtmd_encode(mtmd_context * ctx,
 // get output embeddings from the last encode pass
 MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx);
 
+// returns a copy of output embeddings from the last encode pass, of size n_embd_out
+MTMD_API float * mtmd_get_output_embd_copy(mtmd_context * ctx, size_t * n_embd_out);
+
 /////////////////////////////////////////
 
 //
@@ -231,6 +236,16 @@ MTMD_API int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx,
                                                bool logits_last,
                                                llama_pos * new_n_past);
 
+// helper function to decode an image whose embeddings have already been calculated
+MTMD_API int32_t mtmd_helper_decode_image(mtmd_context *ctx,
+                                          struct llama_context *lctx,
+                                          const mtmd_image_tokens *image_tokens,
+                                          float *embd,
+                                          llama_pos n_past,
+                                          llama_seq_id seq_id,
+                                          int32_t n_batch,
+                                          llama_pos *new_n_past);
+
 /////////////////////////////////////////
 
 // test function, to be used in test-mtmd-c-api.c
@@ -268,6 +283,11 @@ struct mtmd_input_chunk_deleter {
 };
 using input_chunk_ptr = std::unique_ptr<mtmd_input_chunk, mtmd_input_chunk_deleter>;
 
+struct mtmd_image_tokens_deleter {
+    void operator()(mtmd_image_tokens * val) { mtmd_image_tokens_free(val); }
+};
+using image_tokens_ptr = std::unique_ptr<mtmd_image_tokens, mtmd_image_tokens_deleter>;
+
 struct bitmap {
     bitmap_ptr ptr;
     bitmap() : ptr(nullptr) {}

From 816a37520f80d3b965b8f17f2cdb00a1e9d79c33 Mon Sep 17 00:00:00 2001
From: Matt Clayton <matt@lmstudio.ai>
Date: Thu, 8 May 2025 12:44:23 -0400
Subject: [PATCH 2/3] mtmd: Slim down to a single mtmd_helper_decode_image_chunk helper

---
 tools/mtmd/mtmd.cpp | 55 +++++++++++++++++----------------------------
 tools/mtmd/mtmd.h   | 37 ++++++++++++------------------
 2 files changed, 35 insertions(+), 57 deletions(-)

diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
index 41d87740949f0..78a0e1db8e1f1 100644
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@@ -167,7 +167,7 @@ struct mtmd_image_tokens {
     clip_image_f32_batch batch_f32; // preprocessed image patches
     std::string id; // optional user-defined ID, useful for KV cache tracking
 
-    mtmd_image_tokens clone() const {
+    mtmd_image_tokens clone() {
         return mtmd_image_tokens{
             nx,
             ny,
@@ -409,6 +409,12 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
     return 0;
 }
 
+static void mtmd_image_tokens_free(mtmd_image_tokens * image_tokens) {
+    if (image_tokens) {
+        delete image_tokens;
+    }
+}
+
 int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) {
     int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip);
     ctx->image_embd_v.resize(image_tokens->n_tokens() * n_mmproj_embd);
@@ -448,23 +454,6 @@ float * mtmd_get_output_embd(mtmd_context * ctx) {
     return ctx->image_embd_v.data();
 }
 
-float * mtmd_get_output_embd_copy(mtmd_context * ctx, size_t * n_embd_out) {
-    if (ctx->image_embd_v.empty()) {
-        *n_embd_out = 0;
-        return NULL;
-    }
-
-    *n_embd_out = ctx->image_embd_v.size();
-    float * copy = (float *) malloc(*n_embd_out * sizeof(float));
-    if (copy == NULL) {
-        *n_embd_out = 0;
-        return NULL;
-    }
-
-    memcpy(copy, ctx->image_embd_v.data(), ctx->image_embd_v.size() * sizeof(float));
-    return copy;
-}
-
 size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks) {
     size_t n_tokens = 0;
     for (size_t i = 0; i < mtmd_input_chunks_size(chunks); i++) {
@@ -592,15 +581,26 @@ struct decode_embd_batch {
 };
 
 // Helper function for decoding an image whose embeddings have already been calculated
-int32_t mtmd_helper_decode_image(
+int32_t mtmd_helper_decode_image_chunk(
         mtmd_context * ctx,
         struct llama_context * lctx,
-        const mtmd_image_tokens * image_tokens,
+        const mtmd_input_chunk * chunk,
         float * embd,
         llama_pos n_past,
         llama_seq_id seq_id,
         int32_t n_batch,
         llama_pos * new_n_past) {
+
+    if (chunk->type != MTMD_INPUT_CHUNK_TYPE_IMAGE) {
+        LOG_ERR("failed to decode image chunk: input chunk not of image type\n");
+        return -1;
+    }
+    if (!chunk->tokens_image) {
+        LOG_ERR("failed to decode image chunk: image tokens are null\n");
+        return -1;
+    }
+    const auto image_tokens = chunk->tokens_image.get();
+
     int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip);
     int n_pos_per_embd = mtmd_decode_use_mrope(ctx) ? 4 : 1;
 
@@ -710,7 +710,7 @@ int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx,
             LOG_INF("image/slice encoded in %" PRId64 " ms\n", ggml_time_ms() - t0);
         }
         float * embd = mtmd_get_output_embd(ctx);
-        ret = mtmd_helper_decode_image(ctx, lctx, image_tokens, embd, n_past, seq_id, n_batch, new_n_past);
+        ret = mtmd_helper_decode_image_chunk(ctx, lctx, chunk, embd, n_past, seq_id, n_batch, new_n_past);
         if (ret != 0) {
             LOG_ERR("failed to decode image\n");
             llama_batch_free(text_batch);
@@ -931,19 +931,6 @@ llama_pos mtmd_image_tokens_get_n_pos(const mtmd_image_tokens * image_tokens) {
     return image_tokens->n_tokens();
 }
 
-void mtmd_image_tokens_free(mtmd_image_tokens * image_tokens) {
-    if (image_tokens) {
-        delete image_tokens;
-    }
-}
-
-mtmd_image_tokens * mtmd_image_tokens_copy(const mtmd_image_tokens * image_tokens) {
-    if (!image_tokens) {
-        return nullptr;
-    }
-    return new mtmd_image_tokens(image_tokens->clone());
-}
-
 // test function
 
 mtmd_input_chunks * mtmd_test_create_input_chunks() {
diff --git a/tools/mtmd/mtmd.h b/tools/mtmd/mtmd.h
index eb76db8e26d11..8b0cf884573b1 100644
--- a/tools/mtmd/mtmd.h
+++ b/tools/mtmd/mtmd.h
@@ -143,14 +143,12 @@ MTMD_API void               mtmd_input_chunk_free(mtmd_input_chunk * chunk);
 //
 // the instance will be constructed via mtmd_tokenize()
 // it will be freed along with mtmd_input_chunk
-MTMD_API size_t              mtmd_image_tokens_get_n_tokens (const mtmd_image_tokens * image_tokens);
-MTMD_API size_t              mtmd_image_tokens_get_nx       (const mtmd_image_tokens * image_tokens);
-MTMD_API size_t              mtmd_image_tokens_get_ny       (const mtmd_image_tokens * image_tokens);
-MTMD_API const char *        mtmd_image_tokens_get_id       (const mtmd_image_tokens * image_tokens);
+MTMD_API size_t       mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens);
+MTMD_API size_t       mtmd_image_tokens_get_nx      (const mtmd_image_tokens * image_tokens);
+MTMD_API size_t       mtmd_image_tokens_get_ny      (const mtmd_image_tokens * image_tokens);
+MTMD_API const char * mtmd_image_tokens_get_id      (const mtmd_image_tokens * image_tokens);
 // number of temporal positions (always 1 for M-RoPE, n_tokens otherwise)
-MTMD_API llama_pos           mtmd_image_tokens_get_n_pos    (const mtmd_image_tokens * image_tokens);
-MTMD_API mtmd_image_tokens * mtmd_image_tokens_copy         (const mtmd_image_tokens * image_tokens);
-MTMD_API void                mtmd_image_tokens_free         (mtmd_image_tokens * image_tokens);
+MTMD_API llama_pos    mtmd_image_tokens_get_n_pos   (const mtmd_image_tokens * image_tokens);
 
 // tokenize an input text prompt and an image
 // the prompt must have the input image marker (default: "<__image__>") in it
@@ -180,9 +178,6 @@ MTMD_API int32_t mtmd_encode(mtmd_context * ctx,
 // get output embeddings from the last encode pass
 MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx);
 
-// returns a copy of output embeddings from the last encode pass, of size n_embd_out
-MTMD_API float * mtmd_get_output_embd_copy(mtmd_context * ctx, size_t * n_embd_out);
-
 /////////////////////////////////////////
 
 //
@@ -237,14 +232,15 @@ MTMD_API int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx,
                                                llama_pos * new_n_past);
 
 // helper function to decode an image whose embeddings have already been calculated
-MTMD_API int32_t mtmd_helper_decode_image(mtmd_context *ctx,
-                                          struct llama_context *lctx,
-                                          const mtmd_image_tokens *image_tokens,
-                                          float *embd,
-                                          llama_pos n_past,
-                                          llama_seq_id seq_id,
-                                          int32_t n_batch,
-                                          llama_pos *new_n_past);
+// ret 0 on success, -1 on chunk not being a valid image chunk, 1 on decode failure
+MTMD_API int32_t mtmd_helper_decode_image_chunk(mtmd_context *ctx,
+                                                struct llama_context *lctx,
+                                                const mtmd_input_chunk * chunk,
+                                                float *embd,
+                                                llama_pos n_past,
+                                                llama_seq_id seq_id,
+                                                int32_t n_batch,
+                                                llama_pos *new_n_past);
 
 /////////////////////////////////////////
 
@@ -283,11 +279,6 @@ struct mtmd_input_chunk_deleter {
 };
 using input_chunk_ptr = std::unique_ptr<mtmd_input_chunk, mtmd_input_chunk_deleter>;
 
-struct mtmd_image_tokens_deleter {
-    void operator()(mtmd_image_tokens * val) { mtmd_image_tokens_free(val); }
-};
-using image_tokens_ptr = std::unique_ptr<mtmd_image_tokens, mtmd_image_tokens_deleter>;
-
 struct bitmap {
     bitmap_ptr ptr;
     bitmap() : ptr(nullptr) {}

From 4d17bfc0beb1fa35f25628e6ee00e32953839a7d Mon Sep 17 00:00:00 2001
From: Matt Clayton <matt@lmstudio.ai>
Date: Thu, 8 May 2025 13:16:18 -0400
Subject: [PATCH 3/3] mtmd: Cleanups (use chunk accessors, rename embd to encoded_embd)

---
 tools/mtmd/mtmd.cpp | 11 +++++------
 tools/mtmd/mtmd.h   |  9 +++++----
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
index 78a0e1db8e1f1..5d18e8929b31f 100644
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@@ -585,21 +585,20 @@ int32_t mtmd_helper_decode_image_chunk(
         mtmd_context * ctx,
         struct llama_context * lctx,
         const mtmd_input_chunk * chunk,
-        float * embd,
+        float * encoded_embd,
         llama_pos n_past,
         llama_seq_id seq_id,
         int32_t n_batch,
         llama_pos * new_n_past) {
-
-    if (chunk->type != MTMD_INPUT_CHUNK_TYPE_IMAGE) {
+    if (mtmd_input_chunk_get_type(chunk) != MTMD_INPUT_CHUNK_TYPE_IMAGE) {
         LOG_ERR("failed to decode image chunk: input chunk not of image type\n");
         return -1;
     }
-    if (!chunk->tokens_image) {
+    const auto image_tokens = mtmd_input_chunk_get_tokens_image(chunk);
+    if (!image_tokens) {
         LOG_ERR("failed to decode image chunk: image tokens are null\n");
         return -1;
     }
-    const auto image_tokens = chunk->tokens_image.get();
 
     int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip);
     int n_pos_per_embd = mtmd_decode_use_mrope(ctx) ? 4 : 1;
@@ -607,7 +606,7 @@ int32_t mtmd_helper_decode_image_chunk(
     int32_t n_tokens = mtmd_image_tokens_get_n_tokens(image_tokens);
     int32_t i_batch = 0;
     int32_t n_img_batches = GGML_PAD(n_tokens, n_batch) / n_batch;
-    decode_embd_batch batch_embd(embd, n_tokens, n_pos_per_embd, n_mmproj_embd);
+    decode_embd_batch batch_embd(encoded_embd, n_tokens, n_pos_per_embd, n_mmproj_embd);
 
     const int nx = mtmd_image_tokens_get_nx(image_tokens);
     const int ny = mtmd_image_tokens_get_ny(image_tokens);
diff --git a/tools/mtmd/mtmd.h b/tools/mtmd/mtmd.h
index 8b0cf884573b1..54cf481b6aa94 100644
--- a/tools/mtmd/mtmd.h
+++ b/tools/mtmd/mtmd.h
@@ -232,15 +232,16 @@ MTMD_API int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx,
                                                llama_pos * new_n_past);
 
 // helper function to decode an image whose embeddings have already been calculated
+// this helper will handle batching and pre/post decoding setup (e.g. gemma 3 requires non-causal attention)
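-// ret 0 on success, -1 on chunk not being a valid image chunk, 1 on decode failure
+// ret 0 on success, -1 on chunk not being a valid image chunk, otherwise the non-zero llama_decode() result on decode failure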
-MTMD_API int32_t mtmd_helper_decode_image_chunk(mtmd_context *ctx,
-                                                struct llama_context *lctx,
+MTMD_API int32_t mtmd_helper_decode_image_chunk(mtmd_context * ctx,
+                                                struct llama_context * lctx,
                                                 const mtmd_input_chunk * chunk,
-                                                float *embd,
+                                                float * encoded_embd,
                                                 llama_pos n_past,
                                                 llama_seq_id seq_id,
                                                 int32_t n_batch,
-                                                llama_pos *new_n_past);
+                                                llama_pos * new_n_past);
 
 /////////////////////////////////////////