From 6f7a55c4590d34c05503bdb9a9b024a6ed29ed9d Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sun, 27 Apr 2025 20:25:41 +0200 Subject: [PATCH 1/7] llava : add clip_n_output_tokens, deprecate clip_n_patches --- examples/llava/clip.cpp | 32 +++++++++++++++++++++++++++++--- examples/llava/clip.h | 17 ++++++++++++++--- examples/llava/llava.cpp | 15 ++++++++------- examples/llava/mtmd.cpp | 23 ++++++++++++++++------- 4 files changed, 67 insertions(+), 20 deletions(-) diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index 3cd27d5b17a08..d4ccf57bd5ede 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -2835,15 +2835,18 @@ void clip_free(clip_ctx * ctx) { delete ctx; } +// deprecated size_t clip_embd_nbytes(const struct clip_ctx * ctx) { - return clip_n_patches(ctx) * clip_n_mmproj_embd(ctx) * sizeof(float); + const int32_t nx = ctx->vision_model.hparams.image_size; + const int32_t ny = ctx->vision_model.hparams.image_size; + return clip_embd_nbytes_by_img(ctx, ny, nx); } size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w) { clip_image_f32 img; img.nx = img_w; img.ny = img_h; - return clip_n_patches_by_img(ctx, &img) * clip_n_mmproj_embd(ctx) * sizeof(float); + return clip_n_output_tokens(ctx, &img) * clip_n_mmproj_embd(ctx) * sizeof(float); } int32_t clip_get_image_size(const struct clip_ctx * ctx) { @@ -2873,14 +2876,37 @@ size_t get_clip_image_grid_size(const struct clip_ctx * ctx) { return ctx->vision_model.hparams.image_grid_pinpoints.size(); } +// deprecated int clip_n_patches(const struct clip_ctx * ctx) { clip_image_f32 img; img.nx = ctx->vision_model.hparams.image_size; img.ny = ctx->vision_model.hparams.image_size; - return clip_n_patches_by_img(ctx, &img); + return clip_n_output_tokens(ctx, &img); } +// deprecated int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * img) { + return clip_n_output_tokens(ctx, img); +} + +int clip_n_output_tokens_x(const struct clip_ctx * 
ctx, struct clip_image_f32 * img) { + const auto & params = ctx->vision_model.hparams; + const int n_total = clip_n_output_tokens(ctx, img); + if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) { + return img->nx / (params.patch_size * 2) + (int)(img->nx % params.patch_size > 0); + } + return n_total; +} + +int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * img) { + const auto & params = ctx->vision_model.hparams; + if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) { + return img->ny / (params.patch_size * 2) + (int)(img->ny % params.patch_size > 0); + } + return 1; +} + +int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img) { const auto & params = ctx->vision_model.hparams; int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size); diff --git a/examples/llava/clip.h b/examples/llava/clip.h index 6ba42ad892146..1cf3d8dfbe439 100644 --- a/examples/llava/clip.h +++ b/examples/llava/clip.h @@ -59,9 +59,20 @@ CLIP_API const char * clip_patch_merge_type(const struct clip_ctx * ctx); CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx); CLIP_API size_t get_clip_image_grid_size(const struct clip_ctx * ctx); -CLIP_API int clip_n_patches (const struct clip_ctx * ctx); -CLIP_API int clip_n_patches_by_img (const struct clip_ctx * ctx, struct clip_image_f32 * img); -CLIP_API int clip_n_mmproj_embd (const struct clip_ctx * ctx); +GGML_DEPRECATED(CLIP_API int clip_n_patches(const struct clip_ctx * ctx), + "use clip_n_output_tokens instead"); +GGML_DEPRECATED(CLIP_API int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * img), + "use clip_n_output_tokens instead"); + +CLIP_API int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img); + +// for M-RoPE, this will be the number of token positions in X and Y directions +// for other models, 
X will be the total number of tokens and Y will be 1 +CLIP_API int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * img); +CLIP_API int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * img); + +// this should be equal to the embedding dimension of the text model +CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx); CLIP_API int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip); CLIP_API void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size); diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp index 03a22cbb4c205..c00d16aefff10 100644 --- a/examples/llava/llava.cpp +++ b/examples/llava/llava.cpp @@ -112,7 +112,7 @@ static struct clip_image_grid_shape get_anyres_image_grid_shape(const std::pair< } // Take the image segments in a grid configuration and return the embeddings and the number of embeddings into preallocated memory (image_embd_out) -static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out) { +static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out, clip_image_f32 * img_input) { struct { struct ggml_context * ctx; } model; @@ -175,7 +175,7 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector model.ctx = ggml_init(params); - struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_patches(ctx_clip), num_images - 1); // example: 4096 x 576 x 4 + struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_output_tokens(ctx_clip, img_input), num_images - 1); // example: 4096 x 576 x 4 // ggml_tensor_printf(image_features,"image_features",__LINE__,false,false); // 
fill it with the image embeddings, ignoring the base for (size_t i = 1; i < num_images; i++) { @@ -214,8 +214,8 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context // append without newline tokens (default behavior in llava_arch when not using unpad ): - memcpy(image_embd_out + clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip), (float*)result->data, clip_embd_nbytes(ctx_clip) * (num_images-1)); // grid patches - *n_img_pos_out = static_cast(result->ne[1]+clip_n_patches(ctx_clip)); + memcpy(image_embd_out + clip_n_output_tokens(ctx_clip, img_input) * clip_n_mmproj_embd(ctx_clip), (float*)result->data, clip_embd_nbytes(ctx_clip) * (num_images-1)); // grid patches + *n_img_pos_out = static_cast(result->ne[1]+clip_n_output_tokens(ctx_clip, img_input)); // Debug: Test single segments // Current findings: sending base image, sending a segment embedding all works similar to python @@ -313,7 +313,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip), image_embd_v[i], clip_embd_nbytes_by_img(ctx_clip, nx, ny)); - n_img_pos_out += clip_n_patches_by_img(ctx_clip, img_res); + n_img_pos_out += clip_n_output_tokens(ctx_clip, img_res); } *n_img_pos = n_img_pos_out; for (size_t i = 0; i < image_embd_v.size(); i++) { @@ -342,8 +342,8 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli } else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) { // flat / default llava-1.5 type embedding - *n_img_pos = clip_n_patches(ctx_clip); clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), 0); + *n_img_pos = clip_n_output_tokens(ctx_clip, img_res); bool encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd); // image_embd shape is 576 x 4096 if (!encoded) { LOG_ERR("Unable to encode image\n"); @@ -381,7 +381,8 @@ 
static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli struct clip_image_grid_shape grid_shape = get_anyres_image_grid_shape({img->nx,img->ny}, grid_pinpoints, image_size); int n_img_pos_out; - clip_llava_handle_patches(ctx_clip, image_embd_v, grid_shape, image_embd, &n_img_pos_out); + clip_image_f32 * img_input = clip_image_f32_get_img(img_res_v.get(), 0); + clip_llava_handle_patches(ctx_clip, image_embd_v, grid_shape, image_embd, &n_img_pos_out, img_input); *n_img_pos = n_img_pos_out; for (size_t i = 0; i < image_embd_v.size(); i++) { diff --git a/examples/llava/mtmd.cpp b/examples/llava/mtmd.cpp index a994ef0166e6a..2fe252b8cfe0d 100644 --- a/examples/llava/mtmd.cpp +++ b/examples/llava/mtmd.cpp @@ -40,11 +40,14 @@ struct mtmd_context { llama_token tok_sli_img_end = LLAMA_TOKEN_NULL; // single slice llama_token tok_row_end = LLAMA_TOKEN_NULL; // end of row + bool use_mrope = false; // for Qwen2VL, we need to use M-RoPE + // TODO @ngxson : add timings mtmd_context(const char * mmproj_fname, const llama_model * text_model, const mtmd_context_params & ctx_params) : + text_model (text_model), print_timings(ctx_params.print_timings), n_threads (ctx_params.n_threads), image_marker (ctx_params.image_marker) @@ -56,9 +59,8 @@ struct mtmd_context { if (!ctx_clip) { throw std::runtime_error(string_format("Failed to load CLIP model from %s\n", mmproj_fname)); } - this->text_model = text_model; - GGML_ASSERT(!clip_is_qwen2vl(ctx_clip) && "Qwen2VL model is not supported yet, use llama-qwen2vl-cli instead"); + use_mrope = clip_is_qwen2vl(ctx_clip); int minicpmv_version = clip_is_minicpmv(ctx_clip); if (minicpmv_version == 2) { @@ -229,7 +231,7 @@ int32_t mtmd_tokenize(mtmd_context * ctx, for (auto & entry : batch_f32.entries) { mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens); - image_tokens->nx = clip_n_patches_by_img(ctx->ctx_clip, entry.get()); + image_tokens->nx = clip_n_output_tokens(ctx->ctx_clip, entry.get()); image_tokens->ny = 1; 
image_tokens->batch_f32.entries.push_back(std::move(entry)); image_tokens->id = id; @@ -325,12 +327,19 @@ int32_t mtmd_tokenize(mtmd_context * ctx, } else { size_t n_tokens = 0; for (const auto & entry : batch_f32.entries) { - n_tokens += clip_n_patches_by_img(ctx->ctx_clip, entry.get()); + n_tokens += clip_n_output_tokens(ctx->ctx_clip, entry.get()); } mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens); - image_tokens->nx = n_tokens; - image_tokens->ny = 1; // TODO + if (ctx->use_mrope) { + // for Qwen2VL, we need this information for M-RoPE decoding positions + image_tokens->nx = clip_n_output_tokens_x(ctx->ctx_clip, batch_f32.entries[0].get()); + image_tokens->ny = clip_n_output_tokens_y(ctx->ctx_clip, batch_f32.entries[0].get()); + } else { + // other models, we only need the total number of tokens + image_tokens->nx = n_tokens; + image_tokens->ny = 1; + } image_tokens->batch_f32 = std::move(batch_f32); image_tokens->id = bitmaps[i_img].id; // optional @@ -397,7 +406,7 @@ int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) // TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode() const auto & entries = image_tokens->batch_f32.entries; for (size_t i = 0; i < entries.size(); i++) { - int n_tokens_per_image = clip_n_patches_by_img(ctx->ctx_clip, entries[i].get()); + int n_tokens_per_image = clip_n_output_tokens(ctx->ctx_clip, entries[i].get()); ok = clip_image_encode( ctx->ctx_clip, ctx->n_threads, From 8742f8a24e0bf00440fa2579e682b27460bcc6cb Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sun, 27 Apr 2025 20:44:41 +0200 Subject: [PATCH 2/7] mtmd : add qwen2vl and qwen2.5vl --- examples/llava/mtmd.cpp | 40 +++++++++++++++++++++++++++++++++------- examples/llava/mtmd.h | 3 +++ 2 files changed, 36 insertions(+), 7 deletions(-) diff --git a/examples/llava/mtmd.cpp b/examples/llava/mtmd.cpp index 2fe252b8cfe0d..5c7c3b78a3ba7 100644 --- a/examples/llava/mtmd.cpp +++ 
b/examples/llava/mtmd.cpp @@ -204,6 +204,13 @@ int32_t mtmd_tokenize(mtmd_context * ctx, string_replace_all(prompt_modified, ctx->image_marker, marker_modified); } + else if (proj_type == PROJECTOR_TYPE_QWEN2VL || proj_type == PROJECTOR_TYPE_QWEN25VL) { + // <|vision_start|> ... (image embeddings) ... <|vision_end|> + marker_modified = "<|vision_start|>" + ctx->image_marker + "<|vision_end|>"; + string_replace_all(prompt_modified, ctx->image_marker, marker_modified); + + } + // llava-1.5, llava-1.6, Yi-VL, Yi-34B, granite: don't need to add prefix and suffix // for glm-edge, we don't need to add because the tokens are already in the returned embeddings @@ -445,14 +452,16 @@ size_t mtmd_helper_get_n_tokens(mtmd_input_chunks & chunks) { // helper struct to make working with embd batch easier // note: this will be removed after llama_batch_ext refactoring struct decode_embd_batch { + int n_pos_per_embd; + int n_mmproj_embd; std::vector pos; std::vector n_seq_id; std::vector seq_id_0; std::vector seq_ids; std::vector logits; llama_batch batch; - decode_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) { - pos .resize(n_tokens); + decode_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id, int n_pos_per_embd, int n_mmproj_embd) : n_pos_per_embd(n_pos_per_embd), n_mmproj_embd(n_mmproj_embd) { + pos .resize(n_tokens * n_pos_per_embd); n_seq_id.resize(n_tokens); seq_ids .resize(n_tokens + 1); logits .resize(n_tokens); @@ -475,6 +484,18 @@ struct decode_embd_batch { batch.logits [i] = false; } } + + llama_batch get_view(int offset, int n_tokens) { + return { + /*n_tokens =*/ n_tokens, + /*tokens =*/ nullptr, + /*embd =*/ batch.embd + offset * n_mmproj_embd, + /*pos =*/ batch.pos + offset * n_pos_per_embd, + /*n_seq_id =*/ batch.n_seq_id + offset, + /*seq_id =*/ batch.seq_id + offset, + /*logits =*/ batch.logits + offset, + }; + } }; int32_t mtmd_helper_eval(mtmd_context * ctx, @@ -487,6 +508,7 @@ int32_t 
mtmd_helper_eval(mtmd_context * ctx, llama_pos n_past = pos0; llama_batch text_batch = llama_batch_init(n_batch, 0, 1); int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip); + int n_pos_per_embd = mtmd_decode_use_mrope(ctx) ? 4 : 1; for (auto & chunk : chunks) { bool is_last = &chunk == &chunks.back(); @@ -534,6 +556,7 @@ int32_t mtmd_helper_eval(mtmd_context * ctx, int32_t i_batch = 0; int32_t n_img_batches = GGML_PAD(n_tokens, n_batch) / n_batch; float * embd = mtmd_get_output_embd(ctx); + decode_embd_batch batch_embd(embd, n_tokens, n_past, seq_id, n_pos_per_embd, n_mmproj_embd); if (mtmd_decode_use_non_causal(ctx)) { llama_set_causal_attn(lctx, false); @@ -541,15 +564,14 @@ int32_t mtmd_helper_eval(mtmd_context * ctx, } while (i_batch < n_img_batches) { // split into batches - int32_t pos_offset = i_batch*n_batch; - int32_t n_tokens_batch = std::min(n_batch, n_tokens - pos_offset); - float * embd_batch = embd + pos_offset*n_mmproj_embd; - decode_embd_batch batch_img(embd_batch, n_tokens_batch, n_past, 0); + int pos_offset = i_batch*n_batch; + int n_tokens_batch = std::min(n_batch, n_tokens - pos_offset); + llama_batch batch_embd_view = batch_embd.get_view(pos_offset, n_tokens_batch); printf("decoding image batch %d/%d, n_tokens_batch = %d\n", i_batch+1, n_img_batches, n_tokens_batch); int64_t t1 = ggml_time_ms(); - ret = llama_decode(lctx, batch_img.batch); + ret = llama_decode(lctx, batch_embd_view); if (ret != 0) { LOG_ERR("failed to decode image\n"); llama_set_causal_attn(lctx, true); // restore causal attn @@ -612,6 +634,10 @@ bool mtmd_decode_use_non_causal(mtmd_context * ctx) { return false; } +bool mtmd_decode_use_mrope(mtmd_context * ctx) { + return ctx->use_mrope; +} + void mtmd_image_tokens_deleter::operator()(mtmd_image_tokens * val) { mtmd_image_tokens_free(val); } diff --git a/examples/llava/mtmd.h b/examples/llava/mtmd.h index 78be192dd6eb6..02c77c57a7da7 100644 --- a/examples/llava/mtmd.h +++ b/examples/llava/mtmd.h @@ -114,6 +114,9 @@ MTMD_API 
float * mtmd_get_output_embd(mtmd_context * ctx); // whether we need to set non-causal mask before llama_decode MTMD_API bool mtmd_decode_use_non_causal(mtmd_context * ctx); +// whether the current model use M-RoPE for llama_decode +MTMD_API bool mtmd_decode_use_mrope(mtmd_context * ctx); + // From 8646e36489cda248755ebd375e819a30908f0dab Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sun, 27 Apr 2025 21:37:18 +0200 Subject: [PATCH 3/7] decode_embd_batch::set_position_... --- examples/llava/mtmd.cpp | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/examples/llava/mtmd.cpp b/examples/llava/mtmd.cpp index 5c7c3b78a3ba7..76d80cc6d285c 100644 --- a/examples/llava/mtmd.cpp +++ b/examples/llava/mtmd.cpp @@ -460,13 +460,12 @@ struct decode_embd_batch { std::vector seq_ids; std::vector logits; llama_batch batch; - decode_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id, int n_pos_per_embd, int n_mmproj_embd) : n_pos_per_embd(n_pos_per_embd), n_mmproj_embd(n_mmproj_embd) { + decode_embd_batch(float * embd, int32_t n_tokens, int n_pos_per_embd, int n_mmproj_embd) : n_pos_per_embd(n_pos_per_embd), n_mmproj_embd(n_mmproj_embd) { pos .resize(n_tokens * n_pos_per_embd); n_seq_id.resize(n_tokens); seq_ids .resize(n_tokens + 1); logits .resize(n_tokens); seq_id_0.resize(1); - seq_id_0[0] = seq_id; seq_ids [n_tokens] = nullptr; batch = { /*n_tokens =*/ n_tokens, @@ -477,7 +476,11 @@ struct decode_embd_batch { /*seq_id =*/ seq_ids.data(), /*logits =*/ logits.data(), }; - for (int i = 0; i < n_tokens; i++) { + } + + void set_position_normal(llama_pos pos_0, llama_seq_id seq_id) { + seq_id_0[0] = seq_id; + for (int i = 0; i < batch.n_tokens; i++) { batch.pos [i] = pos_0 + i; batch.n_seq_id[i] = 1; batch.seq_id [i] = seq_id_0.data(); @@ -485,6 +488,11 @@ struct decode_embd_batch { } } + void set_position_mrope(llama_pos pos_0, int nx, int ny, llama_seq_id seq_id) { + seq_id_0[0] = seq_id; + 
GGML_ABORT("TODO"); + } + llama_batch get_view(int offset, int n_tokens) { return { /*n_tokens =*/ n_tokens, @@ -556,7 +564,15 @@ int32_t mtmd_helper_eval(mtmd_context * ctx, int32_t i_batch = 0; int32_t n_img_batches = GGML_PAD(n_tokens, n_batch) / n_batch; float * embd = mtmd_get_output_embd(ctx); - decode_embd_batch batch_embd(embd, n_tokens, n_past, seq_id, n_pos_per_embd, n_mmproj_embd); + decode_embd_batch batch_embd(embd, n_tokens, n_pos_per_embd, n_mmproj_embd); + + if (mtmd_decode_use_mrope(ctx)) { + int nx = mtmd_image_tokens_get_nx(chunk.tokens_image.get()); + int ny = mtmd_image_tokens_get_ny(chunk.tokens_image.get()); + batch_embd.set_position_mrope(pos0, nx, ny, seq_id); + } else { + batch_embd.set_position_normal(pos0, seq_id); + } if (mtmd_decode_use_non_causal(ctx)) { llama_set_causal_attn(lctx, false); From b30358409a51d0c894adfa550e88759b85d936c3 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Mon, 28 Apr 2025 16:12:25 +0200 Subject: [PATCH 4/7] working version --- examples/llava/mtmd-cli.cpp | 36 +----------------- examples/llava/mtmd.cpp | 75 ++++++++++++++++++++++++++++++++----- examples/llava/mtmd.h | 6 ++- examples/llava/tests.sh | 4 +- 4 files changed, 75 insertions(+), 46 deletions(-) diff --git a/examples/llava/mtmd-cli.cpp b/examples/llava/mtmd-cli.cpp index 250e8c9a9e871..4d857ca64e0b4 100644 --- a/examples/llava/mtmd-cli.cpp +++ b/examples/llava/mtmd-cli.cpp @@ -136,39 +136,6 @@ struct mtmd_cli_context { } }; -struct decode_embd_batch { - std::vector pos; - std::vector n_seq_id; - std::vector seq_id_0; - std::vector seq_ids; - std::vector logits; - llama_batch batch; - decode_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) { - pos .resize(n_tokens); - n_seq_id.resize(n_tokens); - seq_ids .resize(n_tokens + 1); - logits .resize(n_tokens); - seq_id_0.resize(1); - seq_id_0[0] = seq_id; - seq_ids [n_tokens] = nullptr; - batch = { - /*n_tokens =*/ n_tokens, - /*tokens =*/ nullptr, - /*embd =*/ embd, - 
/*pos =*/ pos.data(), - /*n_seq_id =*/ n_seq_id.data(), - /*seq_id =*/ seq_ids.data(), - /*logits =*/ logits.data(), - }; - for (int i = 0; i < n_tokens; i++) { - batch.pos [i] = pos_0 + i; - batch.n_seq_id[i] = 1; - batch.seq_id [i] = seq_id_0.data(); - batch.logits [i] = false; - } - } -}; - static int generate_response(mtmd_cli_context & ctx, common_sampler * smpl, int n_predict) { llama_tokens generated_tokens; for (int i = 0; i < n_predict; i++) { @@ -243,7 +210,7 @@ static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg, std::vect return 1; } - ctx.n_past += mtmd_helper_get_n_tokens(chunks); + ctx.n_past += mtmd_helper_get_n_pos(chunks); return 0; } @@ -371,6 +338,7 @@ int main(int argc, char ** argv) { } } if (g_is_interrupted) LOG("\nInterrupted by user\n"); + LOG("\n\n"); llama_perf_context_print(ctx.lctx); return g_is_interrupted ? 130 : 0; } diff --git a/examples/llava/mtmd.cpp b/examples/llava/mtmd.cpp index 76d80cc6d285c..9924a61f848c3 100644 --- a/examples/llava/mtmd.cpp +++ b/examples/llava/mtmd.cpp @@ -128,6 +128,7 @@ struct mtmd_image_tokens_data { struct mtmd_image_tokens { uint32_t nx; // number of tokens in x direction uint32_t ny; // number of tokens in y direction + bool use_mrope_pos = false; // use M-RoPE position counting (the whole image is 1 temporal position) uint32_t n_tokens() const { return nx * ny; } clip_image_f32_batch batch_f32; // preprocessed image patches std::string id; // optional user-defined ID, useful for KV cache tracking @@ -342,6 +343,7 @@ int32_t mtmd_tokenize(mtmd_context * ctx, // for Qwen2VL, we need this information for M-RoPE decoding positions image_tokens->nx = clip_n_output_tokens_x(ctx->ctx_clip, batch_f32.entries[0].get()); image_tokens->ny = clip_n_output_tokens_y(ctx->ctx_clip, batch_f32.entries[0].get()); + image_tokens->use_mrope_pos = true; } else { // other models, we only need the total number of tokens image_tokens->nx = n_tokens; @@ -396,6 +398,13 @@ std::string 
mtmd_image_tokens_get_id(const mtmd_image_tokens * image_tokens) { return image_tokens->id; } +llama_pos mtmd_image_tokens_get_n_pos(const mtmd_image_tokens * image_tokens) { + if (image_tokens->use_mrope_pos) { + return 1; // for M-RoPE, the whole image is 1 in temporal dimension + } + return image_tokens->n_tokens(); +} + int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) { int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip); ctx->image_embd_v.resize(image_tokens->n_tokens() * n_mmproj_embd); @@ -441,7 +450,7 @@ size_t mtmd_helper_get_n_tokens(mtmd_input_chunks & chunks) { if (chunk.type == MTMD_INPUT_CHUNK_TYPE_TEXT) { n_tokens += chunk.tokens_text.size(); } else if (chunk.type == MTMD_INPUT_CHUNK_TYPE_IMAGE) { - n_tokens += chunk.tokens_image->n_tokens(); + n_tokens += mtmd_image_tokens_get_n_tokens(chunk.tokens_image.get()); } else { GGML_ASSERT(false && "chunk type not supported"); } @@ -449,12 +458,27 @@ size_t mtmd_helper_get_n_tokens(mtmd_input_chunks & chunks) { return n_tokens; } +llama_pos mtmd_helper_get_n_pos(mtmd_input_chunks & chunks) { + llama_pos n_pos = 0; + for (auto & chunk : chunks) { + if (chunk.type == MTMD_INPUT_CHUNK_TYPE_TEXT) { + n_pos += chunk.tokens_text.size(); + } else if (chunk.type == MTMD_INPUT_CHUNK_TYPE_IMAGE) { + n_pos += mtmd_image_tokens_get_n_pos(chunk.tokens_image.get()); + } else { + GGML_ASSERT(false && "chunk type not supported"); + } + } + return n_pos; +} + // helper struct to make working with embd batch easier // note: this will be removed after llama_batch_ext refactoring struct decode_embd_batch { int n_pos_per_embd; int n_mmproj_embd; std::vector pos; + std::vector pos_view; // used by mrope std::vector n_seq_id; std::vector seq_id_0; std::vector seq_ids; @@ -489,16 +513,46 @@ struct decode_embd_batch { } void set_position_mrope(llama_pos pos_0, int nx, int ny, llama_seq_id seq_id) { + GGML_ASSERT(n_pos_per_embd == 4); seq_id_0[0] = seq_id; - GGML_ABORT("TODO"); + for (int y = 0; y < 
ny; y++) { + for (int x = 0; x < nx; x++) { + int i = y * nx + x; + pos[i ] = pos_0; + pos[i + batch.n_tokens ] = pos_0 + y; + pos[i + batch.n_tokens * 2] = pos_0 + x; + pos[i + batch.n_tokens * 3] = 0; // last pos dim is unused + } + } + for (int i = 0; i < batch.n_tokens; i++) { + batch.n_seq_id[i] = 1; + batch.seq_id [i] = seq_id_0.data(); + batch.logits [i] = false; + } } llama_batch get_view(int offset, int n_tokens) { + llama_pos * pos_ptr; + pos_view.clear(); + pos_view.resize(n_tokens * n_pos_per_embd); + if (n_pos_per_embd > 1) { + // mrope + // for example, with layout of src: 1234...1234...1234...1234... + // offset 2 will give us dst: 34...34...34...34... + for (int i = 0; i < n_pos_per_embd; i++) { + auto src = pos.begin() + i * batch.n_tokens + offset; + pos_view.insert(pos_view.end(), src, src + n_tokens); + } + pos_ptr = pos_view.data(); + } else { + // normal + pos_ptr = pos.data() + offset; + } return { /*n_tokens =*/ n_tokens, /*tokens =*/ nullptr, /*embd =*/ batch.embd + offset * n_mmproj_embd, - /*pos =*/ batch.pos + offset * n_pos_per_embd, + /*pos =*/ pos_ptr, /*n_seq_id =*/ batch.n_seq_id + offset, /*seq_id =*/ batch.seq_id + offset, /*logits =*/ batch.logits + offset, @@ -566,12 +620,13 @@ int32_t mtmd_helper_eval(mtmd_context * ctx, float * embd = mtmd_get_output_embd(ctx); decode_embd_batch batch_embd(embd, n_tokens, n_pos_per_embd, n_mmproj_embd); + const int nx = mtmd_image_tokens_get_nx(chunk.tokens_image.get()); + const int ny = mtmd_image_tokens_get_ny(chunk.tokens_image.get()); + if (mtmd_decode_use_mrope(ctx)) { - int nx = mtmd_image_tokens_get_nx(chunk.tokens_image.get()); - int ny = mtmd_image_tokens_get_ny(chunk.tokens_image.get()); - batch_embd.set_position_mrope(pos0, nx, ny, seq_id); + batch_embd.set_position_mrope(n_past, nx, ny, seq_id); } else { - batch_embd.set_position_normal(pos0, seq_id); + batch_embd.set_position_normal(n_past, seq_id); } if (mtmd_decode_use_non_causal(ctx)) { @@ -584,7 +639,7 @@ int32_t 
mtmd_helper_eval(mtmd_context * ctx, int n_tokens_batch = std::min(n_batch, n_tokens - pos_offset); llama_batch batch_embd_view = batch_embd.get_view(pos_offset, n_tokens_batch); - printf("decoding image batch %d/%d, n_tokens_batch = %d\n", i_batch+1, n_img_batches, n_tokens_batch); + LOG_INF("decoding image batch %d/%d, n_tokens_batch = %d\n", i_batch+1, n_img_batches, n_tokens_batch); int64_t t1 = ggml_time_ms(); ret = llama_decode(lctx, batch_embd_view); @@ -600,9 +655,11 @@ int32_t mtmd_helper_eval(mtmd_context * ctx, } i_batch++; - n_past += n_tokens_batch; } + // for mrope, one image is one single **temporal** position + n_past += mtmd_decode_use_mrope(ctx) ? 1 : n_tokens; + if (mtmd_decode_use_non_causal(ctx)) { llama_set_causal_attn(lctx, true); } diff --git a/examples/llava/mtmd.h b/examples/llava/mtmd.h index 02c77c57a7da7..6805e5e4816c3 100644 --- a/examples/llava/mtmd.h +++ b/examples/llava/mtmd.h @@ -102,6 +102,7 @@ MTMD_API size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * im MTMD_API size_t mtmd_image_tokens_get_nx(const mtmd_image_tokens * image_tokens); MTMD_API size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens); MTMD_API std::string mtmd_image_tokens_get_id(const mtmd_image_tokens * image_tokens); +MTMD_API llama_pos mtmd_image_tokens_get_n_pos(const mtmd_image_tokens * image_tokens); // number of temporal positions (always 1 for M-RoPE, n_tokens otherwise) MTMD_API void mtmd_image_tokens_free(mtmd_image_tokens * image_tokens); // returns 0 on success @@ -123,9 +124,12 @@ MTMD_API bool mtmd_decode_use_mrope(mtmd_context * ctx); // helper functions (can be implemented based on other functions) // -// helper to count the total number of tokens from a list of chunks, useful to keep track of n_past +// helper to count the total number of tokens from a list of chunks, useful to keep track of KV cache MTMD_API size_t mtmd_helper_get_n_tokens(mtmd_input_chunks & chunks); +// helper to count the total position of tokens 
from a list of chunks, useful to keep track of n_past +MTMD_API llama_pos mtmd_helper_get_n_pos(mtmd_input_chunks & chunks); + // helper function that automatically: // 1. run llama_decode() on text chunks // 2. run mtmd_encode() on image chunks, then mtmd_get_output_embd() and then llama_decode() diff --git a/examples/llava/tests.sh b/examples/llava/tests.sh index 4002f9d531bd2..75604315cfeba 100755 --- a/examples/llava/tests.sh +++ b/examples/llava/tests.sh @@ -54,8 +54,8 @@ add_test "llama-mtmd-cli" "ibm-research/granite-vision-3.2-2b-GGUF:Q4_K_M" add_test "llama-mtmd-cli" "second-state/MiniCPM-Llama3-V-2_5-GGUF:Q2_K" # model from openbmb is corrupted add_test "llama-mtmd-cli" "openbmb/MiniCPM-V-2_6-gguf:Q2_K" add_test "llama-mtmd-cli" "openbmb/MiniCPM-o-2_6-gguf:Q4_0" -add_test "llama-qwen2vl-cli" "bartowski/Qwen2-VL-2B-Instruct-GGUF:Q4_K_M" -add_test "llama-qwen2vl-cli" "ggml-org/Qwen2.5-VL-3B-Instruct-GGUF:Q4_K_M" +add_test "llama-mtmd-cli" "bartowski/Qwen2-VL-2B-Instruct-GGUF:Q4_K_M" +add_test "llama-mtmd-cli" "ggml-org/Qwen2.5-VL-3B-Instruct-GGUF:Q4_K_M" # to test the big models, run: ./tests.sh big add_test_big "llama-mtmd-cli" "ggml-org/pixtral-12b-GGUF:Q4_K_M" From d23fdc280f355ddca12118c059fe5e13955c4631 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Mon, 28 Apr 2025 16:54:39 +0200 Subject: [PATCH 5/7] deprecate llama-qwen2vl-cli --- examples/llava/CMakeLists.txt | 8 +------- examples/llava/{qwen2vl-cli.cpp => qwen2vl-test.cpp} | 2 ++ 2 files changed, 3 insertions(+), 7 deletions(-) rename examples/llava/{qwen2vl-cli.cpp => qwen2vl-test.cpp} (99%) diff --git a/examples/llava/CMakeLists.txt b/examples/llava/CMakeLists.txt index 6409b4f5e6cd0..27b6d27e5cac3 100644 --- a/examples/llava/CMakeLists.txt +++ b/examples/llava/CMakeLists.txt @@ -64,13 +64,7 @@ endif() add_executable(llama-llava-cli deprecation-warning.cpp) add_executable(llama-gemma3-cli deprecation-warning.cpp) add_executable(llama-minicpmv-cli deprecation-warning.cpp) - -set(TARGET 
llama-qwen2vl-cli) -add_executable(${TARGET} qwen2vl-cli.cpp) -set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-qwen2vl-cli) -install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_17) +add_executable(llama-qwen2vl-cli deprecation-warning.cpp) set(TARGET llama-mtmd-cli) add_executable(${TARGET} mtmd-cli.cpp) diff --git a/examples/llava/qwen2vl-cli.cpp b/examples/llava/qwen2vl-test.cpp similarity index 99% rename from examples/llava/qwen2vl-cli.cpp rename to examples/llava/qwen2vl-test.cpp index 1e54851ea07a0..7f9e3dca885c6 100644 --- a/examples/llava/qwen2vl-cli.cpp +++ b/examples/llava/qwen2vl-test.cpp @@ -27,6 +27,8 @@ #include #include +// THIS FILE IS ONLY USED FOR TESTING THE QWEN2VL MODEL +// IT IS NOT PRODUCTION CODE static bool qwen2vl_eval_image_embed(llama_context * ctx_llama, const struct llava_image_embed * image_embed, int n_batch, int * n_past, int * st_pos_id, struct clip_image_size * image_size) { From 496f1ce8e27997370f481d67ff708a24c97b7629 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Tue, 29 Apr 2025 10:44:47 +0200 Subject: [PATCH 6/7] correct order W, H of clip_embd_nbytes_by_img --- examples/llava/clip.cpp | 4 ++-- examples/llava/clip.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index 26964005af342..ad3e7df1d8a3a 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -2829,10 +2829,10 @@ void clip_free(clip_ctx * ctx) { size_t clip_embd_nbytes(const struct clip_ctx * ctx) { const int32_t nx = ctx->vision_model.hparams.image_size; const int32_t ny = ctx->vision_model.hparams.image_size; - return clip_embd_nbytes_by_img(ctx, ny, nx); + return clip_embd_nbytes_by_img(ctx, nx, ny); } -size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w) { +size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int
int img_w, int img_h) { clip_image_f32 img; img.nx = img_w; img.ny = img_h; diff --git a/examples/llava/clip.h b/examples/llava/clip.h index 1cf3d8dfbe439..0a53bd8eb78e1 100644 --- a/examples/llava/clip.h +++ b/examples/llava/clip.h @@ -47,7 +47,7 @@ CLIP_API struct clip_ctx * clip_init(const char * fname, struct clip_context_par CLIP_API void clip_free(struct clip_ctx * ctx); CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx); -CLIP_API size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w); +CLIP_API size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_w, int img_h); CLIP_API int32_t clip_get_image_size (const struct clip_ctx * ctx); CLIP_API int32_t clip_get_patch_size (const struct clip_ctx * ctx); From db85de105ff789fac6e948e9bc7c86619fc9c349 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Tue, 29 Apr 2025 10:52:00 +0200 Subject: [PATCH 7/7] edit existing line in hot topics --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 1785493c3e2b0..42c0eb633ef5d 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) ## Hot topics - **GGML developer experience survey (organized and reviewed by NVIDIA):** [link](https://forms.gle/Gasw3cRgyhNEnrwK9) -- A new binary `llama-mtmd-cli` is introduced to replace `llava-cli`, `minicpmv-cli` and `gemma3-cli` https://github.com/ggml-org/llama.cpp/pull/13012, `libllava` will be deprecated +- A new binary `llama-mtmd-cli` is introduced to replace `llava-cli`, `minicpmv-cli`, `gemma3-cli` ([#13012](https://github.com/ggml-org/llama.cpp/pull/13012)) and `qwen2vl-cli` ([#13141]((https://github.com/ggml-org/llama.cpp/pull/13141))), `libllava` will be deprecated - VS Code extension for FIM completions: https://github.com/ggml-org/llama.vscode - Universal [tool call support](./docs/function-calling.md) in `llama-server` 
https://github.com/ggml-org/llama.cpp/pull/9639 - Vim/Neovim plugin for FIM completions: https://github.com/ggml-org/llama.vim