From 227e1396deeaa44336475d1af0a0753263aba454 Mon Sep 17 00:00:00 2001 From: Matt Clayton Date: Wed, 7 May 2025 17:54:00 -0400 Subject: [PATCH 1/3] mtmd: Expose helper_decode_image, output_embd_copy, image_tokens_copy/free --- tools/mtmd/mtmd.cpp | 157 ++++++++++++++++++++++++++++---------------- tools/mtmd/mtmd.h | 30 +++++++-- 2 files changed, 124 insertions(+), 63 deletions(-) diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index b600e4341375f..41d87740949f0 100644 --- a/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp @@ -167,7 +167,7 @@ struct mtmd_image_tokens { clip_image_f32_batch batch_f32; // preprocessed image patches std::string id; // optional user-defined ID, useful for KV cache tracking - mtmd_image_tokens clone() { + mtmd_image_tokens clone() const { return mtmd_image_tokens{ nx, ny, @@ -409,12 +409,6 @@ int32_t mtmd_tokenize(mtmd_context * ctx, return 0; } -static void mtmd_image_tokens_free(mtmd_image_tokens * image_tokens) { - if (image_tokens) { - delete image_tokens; - } -} - int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) { int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip); ctx->image_embd_v.resize(image_tokens->n_tokens() * n_mmproj_embd); @@ -454,6 +448,23 @@ float * mtmd_get_output_embd(mtmd_context * ctx) { return ctx->image_embd_v.data(); } +float * mtmd_get_output_embd_copy(mtmd_context * ctx, size_t * n_embd_out) { + if (ctx->image_embd_v.empty()) { + *n_embd_out = 0; + return NULL; + } + + *n_embd_out = ctx->image_embd_v.size(); + float * copy = (float *) malloc(*n_embd_out * sizeof(float)); + if (copy == NULL) { + *n_embd_out = 0; + return NULL; + } + + memcpy(copy, ctx->image_embd_v.data(), ctx->image_embd_v.size() * sizeof(float)); + return copy; +} + size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks) { size_t n_tokens = 0; for (size_t i = 0; i < mtmd_input_chunks_size(chunks); i++) { @@ -580,6 +591,69 @@ struct decode_embd_batch { } }; +// Helper function for decoding an image 
whose embeddings have already been calculated +int32_t mtmd_helper_decode_image( + mtmd_context * ctx, + struct llama_context * lctx, + const mtmd_image_tokens * image_tokens, + float * embd, + llama_pos n_past, + llama_seq_id seq_id, + int32_t n_batch, + llama_pos * new_n_past) { + int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip); + int n_pos_per_embd = mtmd_decode_use_mrope(ctx) ? 4 : 1; + + int32_t n_tokens = mtmd_image_tokens_get_n_tokens(image_tokens); + int32_t i_batch = 0; + int32_t n_img_batches = GGML_PAD(n_tokens, n_batch) / n_batch; + decode_embd_batch batch_embd(embd, n_tokens, n_pos_per_embd, n_mmproj_embd); + + const int nx = mtmd_image_tokens_get_nx(image_tokens); + const int ny = mtmd_image_tokens_get_ny(image_tokens); + + if (mtmd_decode_use_mrope(ctx)) { + batch_embd.set_position_mrope(n_past, nx, ny, seq_id); + } else { + batch_embd.set_position_normal(n_past, seq_id); + } + + if (mtmd_decode_use_non_causal(ctx)) { + llama_set_causal_attn(lctx, false); + // TODO @ngxson : need to make sure only one image is processed at a time, and n_ubatch must be enough to hold the image + } + + while (i_batch < n_img_batches) { // split into batches + int pos_offset = i_batch*n_batch; + int n_tokens_batch = std::min(n_batch, n_tokens - pos_offset); + llama_batch batch_embd_view = batch_embd.get_view(pos_offset, n_tokens_batch); + + LOG_INF("decoding image batch %d/%d, n_tokens_batch = %d\n", i_batch+1, n_img_batches, n_tokens_batch); + + int64_t t1 = ggml_time_ms(); + int32_t ret = llama_decode(lctx, batch_embd_view); + if (ret != 0) { + LOG_ERR("failed to decode image\n"); + llama_set_causal_attn(lctx, true); // restore causal attn + return ret; + } + + if (ctx->print_timings) { + LOG_INF("image decoded (batch %d/%d) in %" PRId64 " ms\n", i_batch+1, n_img_batches, ggml_time_ms() - t1); + } + + i_batch++; + } + + n_past += mtmd_image_tokens_get_n_pos(image_tokens); + *new_n_past = n_past; + + if (mtmd_decode_use_non_causal(ctx)) { + 
llama_set_causal_attn(lctx, true); + } + return 0; +} + int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx, struct llama_context * lctx, const mtmd_input_chunk * chunk, @@ -591,8 +665,6 @@ int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx, int32_t ret; llama_batch text_batch = llama_batch_init(n_batch, 0, 1); auto chunk_type = mtmd_input_chunk_get_type(chunk); - int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip); - int n_pos_per_embd = mtmd_decode_use_mrope(ctx) ? 4 : 1; if (chunk_type == MTMD_INPUT_CHUNK_TYPE_TEXT) { size_t n_tokens; @@ -637,57 +709,13 @@ int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx, if (ctx->print_timings) { LOG_INF("image/slice encoded in %" PRId64 " ms\n", ggml_time_ms() - t0); } - - int32_t n_tokens = mtmd_image_tokens_get_n_tokens(image_tokens); - int32_t i_batch = 0; - int32_t n_img_batches = GGML_PAD(n_tokens, n_batch) / n_batch; float * embd = mtmd_get_output_embd(ctx); - decode_embd_batch batch_embd(embd, n_tokens, n_pos_per_embd, n_mmproj_embd); - - const int nx = mtmd_image_tokens_get_nx(image_tokens); - const int ny = mtmd_image_tokens_get_ny(image_tokens); - - if (mtmd_decode_use_mrope(ctx)) { - batch_embd.set_position_mrope(n_past, nx, ny, seq_id); - } else { - batch_embd.set_position_normal(n_past, seq_id); - } - - if (mtmd_decode_use_non_causal(ctx)) { - llama_set_causal_attn(lctx, false); - // TODO @ngxson : need to make sure only one image is processed at a time, and n_ubatch must be enough to hold the image - } - - while (i_batch < n_img_batches) { // split into batches - int pos_offset = i_batch*n_batch; - int n_tokens_batch = std::min(n_batch, n_tokens - pos_offset); - llama_batch batch_embd_view = batch_embd.get_view(pos_offset, n_tokens_batch); - - LOG_INF("decoding image batch %d/%d, n_tokens_batch = %d\n", i_batch+1, n_img_batches, n_tokens_batch); - - int64_t t1 = ggml_time_ms(); - ret = llama_decode(lctx, batch_embd_view); - if (ret != 0) { - LOG_ERR("failed to decode image\n"); - 
llama_set_causal_attn(lctx, true); // restore causal attn - llama_batch_free(text_batch); - return ret; - } - - if (ctx->print_timings) { - LOG_INF("image decoded (batch %d/%d) in %" PRId64 " ms\n", i_batch+1, n_img_batches, ggml_time_ms() - t1); - } - - i_batch++; - } - - n_past += mtmd_image_tokens_get_n_pos(image_tokens); - *new_n_past = n_past; - - if (mtmd_decode_use_non_causal(ctx)) { - llama_set_causal_attn(lctx, true); + ret = mtmd_helper_decode_image(ctx, lctx, image_tokens, embd, n_past, seq_id, n_batch, new_n_past); + if (ret != 0) { + LOG_ERR("failed to decode image\n"); + llama_batch_free(text_batch); + return ret; } - } else { GGML_ABORT("chunk type not supported"); } @@ -903,6 +931,19 @@ llama_pos mtmd_image_tokens_get_n_pos(const mtmd_image_tokens * image_tokens) { return image_tokens->n_tokens(); } +void mtmd_image_tokens_free(mtmd_image_tokens * image_tokens) { + if (image_tokens) { + delete image_tokens; + } +} + +mtmd_image_tokens * mtmd_image_tokens_copy(const mtmd_image_tokens * image_tokens) { + if (!image_tokens) { + return nullptr; + } + return new mtmd_image_tokens(image_tokens->clone()); +} + // test function mtmd_input_chunks * mtmd_test_create_input_chunks() { diff --git a/tools/mtmd/mtmd.h b/tools/mtmd/mtmd.h index e2f76e2e8d346..eb76db8e26d11 100644 --- a/tools/mtmd/mtmd.h +++ b/tools/mtmd/mtmd.h @@ -143,12 +143,14 @@ MTMD_API void mtmd_input_chunk_free(mtmd_input_chunk * chunk); // // the instance will be constructed via mtmd_tokenize() // it will be freed along with mtmd_input_chunk -MTMD_API size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens); -MTMD_API size_t mtmd_image_tokens_get_nx (const mtmd_image_tokens * image_tokens); -MTMD_API size_t mtmd_image_tokens_get_ny (const mtmd_image_tokens * image_tokens); -MTMD_API const char * mtmd_image_tokens_get_id (const mtmd_image_tokens * image_tokens); +MTMD_API size_t mtmd_image_tokens_get_n_tokens (const mtmd_image_tokens * image_tokens); +MTMD_API size_t 
mtmd_image_tokens_get_nx (const mtmd_image_tokens * image_tokens); +MTMD_API size_t mtmd_image_tokens_get_ny (const mtmd_image_tokens * image_tokens); +MTMD_API const char * mtmd_image_tokens_get_id (const mtmd_image_tokens * image_tokens); // number of temporal positions (always 1 for M-RoPE, n_tokens otherwise) -MTMD_API llama_pos mtmd_image_tokens_get_n_pos (const mtmd_image_tokens * image_tokens); +MTMD_API llama_pos mtmd_image_tokens_get_n_pos (const mtmd_image_tokens * image_tokens); +MTMD_API mtmd_image_tokens * mtmd_image_tokens_copy (const mtmd_image_tokens * image_tokens); +MTMD_API void mtmd_image_tokens_free (mtmd_image_tokens * image_tokens); // tokenize an input text prompt and an image // the prompt must have the input image marker (default: "<__image__>") in it @@ -178,6 +180,9 @@ MTMD_API int32_t mtmd_encode(mtmd_context * ctx, // get output embeddings from the last encode pass MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx); +// returns a copy of output embeddings from the last encode pass, of size n_embd_out +MTMD_API float * mtmd_get_output_embd_copy(mtmd_context * ctx, size_t * n_embd_out); + ///////////////////////////////////////// // @@ -231,6 +236,16 @@ MTMD_API int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx, bool logits_last, llama_pos * new_n_past); +// helper function to decode an image whose embeddings have already been calculated +MTMD_API int32_t mtmd_helper_decode_image(mtmd_context *ctx, + struct llama_context *lctx, + const mtmd_image_tokens *image_tokens, + float *embd, + llama_pos n_past, + llama_seq_id seq_id, + int32_t n_batch, + llama_pos *new_n_past); + ///////////////////////////////////////// // test function, to be used in test-mtmd-c-api.c @@ -268,6 +283,11 @@ struct mtmd_input_chunk_deleter { }; using input_chunk_ptr = std::unique_ptr; +struct mtmd_image_tokens_deleter { + void operator()(mtmd_image_tokens * val) { mtmd_image_tokens_free(val); } +}; +using image_tokens_ptr = std::unique_ptr; + struct 
bitmap { bitmap_ptr ptr; bitmap() : ptr(nullptr) {} From 816a37520f80d3b965b8f17f2cdb00a1e9d79c33 Mon Sep 17 00:00:00 2001 From: Matt Clayton Date: Thu, 8 May 2025 12:44:23 -0400 Subject: [PATCH 2/3] Slim down --- tools/mtmd/mtmd.cpp | 55 +++++++++++++++++---------------------------- tools/mtmd/mtmd.h | 37 ++++++++++++------------------ 2 files changed, 35 insertions(+), 57 deletions(-) diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index 41d87740949f0..78a0e1db8e1f1 100644 --- a/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp @@ -167,7 +167,7 @@ struct mtmd_image_tokens { clip_image_f32_batch batch_f32; // preprocessed image patches std::string id; // optional user-defined ID, useful for KV cache tracking - mtmd_image_tokens clone() const { + mtmd_image_tokens clone() { return mtmd_image_tokens{ nx, ny, @@ -409,6 +409,12 @@ int32_t mtmd_tokenize(mtmd_context * ctx, return 0; } +static void mtmd_image_tokens_free(mtmd_image_tokens * image_tokens) { + if (image_tokens) { + delete image_tokens; + } +} + int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) { int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip); ctx->image_embd_v.resize(image_tokens->n_tokens() * n_mmproj_embd); @@ -448,23 +454,6 @@ float * mtmd_get_output_embd(mtmd_context * ctx) { return ctx->image_embd_v.data(); } -float * mtmd_get_output_embd_copy(mtmd_context * ctx, size_t * n_embd_out) { - if (ctx->image_embd_v.empty()) { - *n_embd_out = 0; - return NULL; - } - - *n_embd_out = ctx->image_embd_v.size(); - float * copy = (float *) malloc(*n_embd_out * sizeof(float)); - if (copy == NULL) { - *n_embd_out = 0; - return NULL; - } - - memcpy(copy, ctx->image_embd_v.data(), ctx->image_embd_v.size() * sizeof(float)); - return copy; -} - size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks) { size_t n_tokens = 0; for (size_t i = 0; i < mtmd_input_chunks_size(chunks); i++) { @@ -592,15 +581,26 @@ struct decode_embd_batch { }; // Helper function for decoding an 
image whose embeddings have already been calculated -int32_t mtmd_helper_decode_image( +int32_t mtmd_helper_decode_image_chunk( mtmd_context * ctx, struct llama_context * lctx, - const mtmd_image_tokens * image_tokens, + const mtmd_input_chunk * chunk, float * embd, llama_pos n_past, llama_seq_id seq_id, int32_t n_batch, llama_pos * new_n_past) { + + if (chunk->type != MTMD_INPUT_CHUNK_TYPE_IMAGE) { + LOG_ERR("failed to decode image chunk: input chunk not of image type\n"); + return -1; + } + if (!chunk->tokens_image) { + LOG_ERR("failed to decode image chunk: image tokens are null\n"); + return -1; + } + const auto image_tokens = chunk->tokens_image.get(); + int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip); int n_pos_per_embd = mtmd_decode_use_mrope(ctx) ? 4 : 1; @@ -710,7 +710,7 @@ int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx, LOG_INF("image/slice encoded in %" PRId64 " ms\n", ggml_time_ms() - t0); } float * embd = mtmd_get_output_embd(ctx); - ret = mtmd_helper_decode_image(ctx, lctx, image_tokens, embd, n_past, seq_id, n_batch, new_n_past); + ret = mtmd_helper_decode_image_chunk(ctx, lctx, chunk, embd, n_past, seq_id, n_batch, new_n_past); if (ret != 0) { LOG_ERR("failed to decode image\n"); llama_batch_free(text_batch); @@ -931,19 +931,6 @@ llama_pos mtmd_image_tokens_get_n_pos(const mtmd_image_tokens * image_tokens) { return image_tokens->n_tokens(); } -void mtmd_image_tokens_free(mtmd_image_tokens * image_tokens) { - if (image_tokens) { - delete image_tokens; - } -} - -mtmd_image_tokens * mtmd_image_tokens_copy(const mtmd_image_tokens * image_tokens) { - if (!image_tokens) { - return nullptr; - } - return new mtmd_image_tokens(image_tokens->clone()); -} - // test function mtmd_input_chunks * mtmd_test_create_input_chunks() { diff --git a/tools/mtmd/mtmd.h b/tools/mtmd/mtmd.h index eb76db8e26d11..8b0cf884573b1 100644 --- a/tools/mtmd/mtmd.h +++ b/tools/mtmd/mtmd.h @@ -143,14 +143,12 @@ MTMD_API void mtmd_input_chunk_free(mtmd_input_chunk * 
chunk); // // the instance will be constructed via mtmd_tokenize() // it will be freed along with mtmd_input_chunk -MTMD_API size_t mtmd_image_tokens_get_n_tokens (const mtmd_image_tokens * image_tokens); -MTMD_API size_t mtmd_image_tokens_get_nx (const mtmd_image_tokens * image_tokens); -MTMD_API size_t mtmd_image_tokens_get_ny (const mtmd_image_tokens * image_tokens); -MTMD_API const char * mtmd_image_tokens_get_id (const mtmd_image_tokens * image_tokens); +MTMD_API size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens); +MTMD_API size_t mtmd_image_tokens_get_nx (const mtmd_image_tokens * image_tokens); +MTMD_API size_t mtmd_image_tokens_get_ny (const mtmd_image_tokens * image_tokens); +MTMD_API const char * mtmd_image_tokens_get_id (const mtmd_image_tokens * image_tokens); // number of temporal positions (always 1 for M-RoPE, n_tokens otherwise) -MTMD_API llama_pos mtmd_image_tokens_get_n_pos (const mtmd_image_tokens * image_tokens); -MTMD_API mtmd_image_tokens * mtmd_image_tokens_copy (const mtmd_image_tokens * image_tokens); -MTMD_API void mtmd_image_tokens_free (mtmd_image_tokens * image_tokens); +MTMD_API llama_pos mtmd_image_tokens_get_n_pos (const mtmd_image_tokens * image_tokens); // tokenize an input text prompt and an image // the prompt must have the input image marker (default: "<__image__>") in it @@ -180,9 +178,6 @@ MTMD_API int32_t mtmd_encode(mtmd_context * ctx, // get output embeddings from the last encode pass MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx); -// returns a copy of output embeddings from the last encode pass, of size n_embd_out -MTMD_API float * mtmd_get_output_embd_copy(mtmd_context * ctx, size_t * n_embd_out); - ///////////////////////////////////////// // @@ -237,14 +232,15 @@ MTMD_API int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx, llama_pos * new_n_past); // helper function to decode an image whose embeddings have already been calculated -MTMD_API int32_t 
mtmd_helper_decode_image(mtmd_context *ctx, - struct llama_context *lctx, - const mtmd_image_tokens *image_tokens, - float *embd, - llama_pos n_past, - llama_seq_id seq_id, - int32_t n_batch, - llama_pos *new_n_past); +// ret 0 on success, -1 on chunk not being a valid image chunk, otherwise the non-zero llama_decode() result on decode failure +MTMD_API int32_t mtmd_helper_decode_image_chunk(mtmd_context *ctx, + struct llama_context *lctx, + const mtmd_input_chunk * chunk, + float *embd, + llama_pos n_past, + llama_seq_id seq_id, + int32_t n_batch, + llama_pos *new_n_past); ///////////////////////////////////////// @@ -283,11 +279,6 @@ struct mtmd_input_chunk_deleter { }; using input_chunk_ptr = std::unique_ptr; -struct mtmd_image_tokens_deleter { - void operator()(mtmd_image_tokens * val) { mtmd_image_tokens_free(val); } -}; -using image_tokens_ptr = std::unique_ptr; - struct bitmap { bitmap_ptr ptr; bitmap() : ptr(nullptr) {} From 4d17bfc0beb1fa35f25628e6ee00e32953839a7d Mon Sep 17 00:00:00 2001 From: Matt Clayton Date: Thu, 8 May 2025 13:16:18 -0400 Subject: [PATCH 3/3] Cleanups --- tools/mtmd/mtmd.cpp | 11 +++++------ tools/mtmd/mtmd.h | 9 +++++---- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index 78a0e1db8e1f1..5d18e8929b31f 100644 --- a/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp @@ -585,21 +585,20 @@ int32_t mtmd_helper_decode_image_chunk( mtmd_context * ctx, struct llama_context * lctx, const mtmd_input_chunk * chunk, - float * embd, + float * encoded_embd, llama_pos n_past, llama_seq_id seq_id, int32_t n_batch, llama_pos * new_n_past) { - - if (chunk->type != MTMD_INPUT_CHUNK_TYPE_IMAGE) { + if (mtmd_input_chunk_get_type(chunk) != MTMD_INPUT_CHUNK_TYPE_IMAGE) { LOG_ERR("failed to decode image chunk: input chunk not of image type\n"); return -1; } - if (!chunk->tokens_image) { + const auto image_tokens = mtmd_input_chunk_get_tokens_image(chunk); + if (!image_tokens) { LOG_ERR("failed to decode image chunk: image tokens are null\n");
return -1; } - const auto image_tokens = chunk->tokens_image.get(); int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip); int n_pos_per_embd = mtmd_decode_use_mrope(ctx) ? 4 : 1; @@ -607,7 +606,7 @@ int32_t mtmd_helper_decode_image_chunk( int32_t n_tokens = mtmd_image_tokens_get_n_tokens(image_tokens); int32_t i_batch = 0; int32_t n_img_batches = GGML_PAD(n_tokens, n_batch) / n_batch; - decode_embd_batch batch_embd(embd, n_tokens, n_pos_per_embd, n_mmproj_embd); + decode_embd_batch batch_embd(encoded_embd, n_tokens, n_pos_per_embd, n_mmproj_embd); const int nx = mtmd_image_tokens_get_nx(image_tokens); const int ny = mtmd_image_tokens_get_ny(image_tokens); diff --git a/tools/mtmd/mtmd.h b/tools/mtmd/mtmd.h index 8b0cf884573b1..54cf481b6aa94 100644 --- a/tools/mtmd/mtmd.h +++ b/tools/mtmd/mtmd.h @@ -232,15 +232,16 @@ MTMD_API int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx, llama_pos * new_n_past); // helper function to decode an image whose embeddings have already been calculated +// this helper will handle batching and pre/post decoding setup (e.g. gemma 3 requires non-causal attention) // ret 0 on success, -1 on chunk not being a valid image chunk, otherwise the non-zero llama_decode() result on decode failure -MTMD_API int32_t mtmd_helper_decode_image_chunk(mtmd_context *ctx, - struct llama_context *lctx, +MTMD_API int32_t mtmd_helper_decode_image_chunk(mtmd_context * ctx, + struct llama_context * lctx, const mtmd_input_chunk * chunk, - float *embd, + float * encoded_embd, llama_pos n_past, llama_seq_id seq_id, int32_t n_batch, - llama_pos *new_n_past); + llama_pos * new_n_past); /////////////////////////////////////////