Commit c0ac70c

llama-cli : add initial GPU sampling support
This commit adds initial support for GPU sampling in llama-cli.

Options:
```console
$ ./build/bin/llama-cli --help
----- sampling params -----
...
--gpu-sampling                enable GPU sampling (default: disabled)
--gpu-top-k N                 GPU top-k sampling (default: 40, <= 0 = disabled)
--gpu-top-p-approx-k N        GPU top-p approximation using top-k (default: 0, 0 = disabled)
--gpu-temp N                  GPU temperature (default: 0.80, 0.0 = disabled, greedy sampling)
--gpu-softmax                 add GPU softmax to sampling chain (default: disabled)
--gpu-dist                    add GPU dist (final sampling) to sampling chain (default: disabled)
```

Usage:
```console
$ ./build/bin/llama-cli -m models/Qwen2.5-VL-3B-Instruct-Q8_0.gguf \
    -p "What is the Capital of Sweden?" \
    --gpu-sampling \
    --gpu-temp 0 \
    --gpu-top-k 20 \
    --gpu-dist \
    -ngl 99 \
    -no-cnv \
    -n 20 \
    --no-warmup
```
1 parent: 0bf740e

7 files changed: +176 additions, -4 deletions


common/arg.cpp

Lines changed: 43 additions & 0 deletions
```diff
@@ -2477,6 +2477,49 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.sampling.grammar = json_schema_to_grammar(json::parse(schema));
         }
     ).set_sparam());
+    add_opt(common_arg(
+        {"--gpu-sampling"},
+        "enable GPU sampling (default: disabled)",
+        [](common_params & params) {
+            params.sampling.gpu_sampling = true;
+        }
+    ).set_sparam());
+    add_opt(common_arg(
+        {"--gpu-top-k"}, "N",
+        string_format("GPU top-k sampling (default: %d, <= 0 = disabled)", params.sampling.gpu_top_k),
+        [](common_params & params, int value) {
+            params.sampling.gpu_top_k = value;
+        }
+    ).set_sparam());
+    add_opt(common_arg(
+        {"--gpu-top-p-approx-k"}, "N",
+        string_format("GPU top-p approximation using top-k (default: %d, 0 = disabled)", params.sampling.gpu_top_p_approx_k),
+        [](common_params & params, int value) {
+            params.sampling.gpu_top_p_approx_k = value;
+        }
+    ).set_sparam());
+    add_opt(common_arg(
+        {"--gpu-temp"}, "N",
+        string_format("GPU temperature (default: %.2f, 0.0 = disabled, greedy sampling)", (double)params.sampling.gpu_temp),
+        [](common_params & params, const std::string & value) {
+            params.sampling.gpu_temp = std::stof(value);
+            params.sampling.gpu_temp = std::max(params.sampling.gpu_temp, 0.0f);
+        }
+    ).set_sparam());
+    add_opt(common_arg(
+        {"--gpu-softmax"},
+        "add GPU softmax to sampling chain (default: disabled)",
+        [](common_params & params) {
+            params.sampling.gpu_softmax = true;
+        }
+    ).set_sparam());
+    add_opt(common_arg(
+        {"--gpu-dist"},
+        "add GPU dist (final sampling) to sampling chain (default: disabled)",
+        [](common_params & params) {
+            params.sampling.gpu_dist = true;
+        }
+    ).set_sparam());
     add_opt(common_arg(
         {"--pooling"}, "{none,mean,cls,last,rank}",
         "pooling type for embeddings, use model default if unspecified",
```

common/common.cpp

Lines changed: 3 additions & 0 deletions
```diff
@@ -8,6 +8,7 @@
 #include "common.h"
 #include "log.h"
 #include "llama.h"
+#include "sampling.h"
 
 #include <algorithm>
 #include <cinttypes>
@@ -927,6 +928,8 @@ struct common_init_result common_init_from_params(common_params & params) {
     const llama_vocab * vocab = llama_model_get_vocab(model);
 
     auto cparams = common_context_params_to_llama(params);
+    cparams.samplers = params.gpu_samplers;
+    cparams.n_samplers = params.n_gpu_samplers;
 
     llama_context * lctx = llama_init_from_model(model, cparams);
     if (lctx == NULL) {
```

common/common.h

Lines changed: 11 additions & 0 deletions
```diff
@@ -188,6 +188,14 @@ struct common_params_sampling {
     std::vector<llama_logit_bias> logit_bias;     // logit biases to apply
     std::vector<llama_logit_bias> logit_bias_eog; // pre-calculated logit biases for EOG tokens
 
+    // GPU sampling parameters
+    bool gpu_sampling = false;      // enable GPU sampling
+    int32_t gpu_top_k = 40;         // GPU top-k (<= 0 to disable)
+    int32_t gpu_top_p_approx_k = 0; // GPU top-p approximation using top-k (0 = disabled)
+    float gpu_temp = 0.80f;         // GPU temperature (0.0 = disabled, greedy sampling)
+    bool gpu_softmax = false;       // add GPU softmax to chain
+    bool gpu_dist = false;          // add GPU dist (final sampling) to chain
+
     // print the parameters into a string
     std::string print() const;
 };
@@ -511,6 +519,9 @@ struct common_params {
     bool has_speculative() const {
         return !speculative.model.path.empty() || !speculative.model.hf_repo.empty();
     }
+
+    struct llama_sampler_seq_config * gpu_samplers;
+    size_t n_gpu_samplers;
 };
 
 // call once at the start of a program if it uses libcommon
```
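The new fields mirror the `--gpu-*` flags one-to-one, so GPU sampling can also be configured in code rather than via the CLI. A minimal sketch, assuming a loaded `llama_model * model` and the `common_sampler_gpu_init()` helper added in `common/sampling.cpp` below; the hard-coded values are just for illustration:

```cpp
#include "common.h"
#include "sampling.h"

// Sketch: programmatic equivalent of --gpu-sampling --gpu-temp 0.8 --gpu-top-k 20 --gpu-dist.
// `model` is assumed to be a loaded llama_model *; error handling omitted.
static llama_sampler * make_gpu_chain(const llama_model * model) {
    common_params_sampling sparams;

    sparams.gpu_sampling = true;  // master switch, same as --gpu-sampling
    sparams.gpu_temp     = 0.8f;  // temperature scaling on the GPU
    sparams.gpu_top_k    = 20;    // keep only the 20 most likely tokens
    sparams.gpu_dist     = true;  // let the GPU pick the final token

    // Returns an empty chain when gpu_sampling is false, otherwise a chain
    // containing only the GPU samplers enabled above.
    return common_sampler_gpu_init(model, sparams);
}
```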

common/sampling.cpp

Lines changed: 81 additions & 4 deletions
```diff
@@ -113,17 +113,51 @@ struct common_sampler {
     llama_token_data_array cur_p;
 
     void set_logits(struct llama_context * ctx, int idx) {
-        const auto * logits = llama_get_logits_ith(ctx, idx);
+        const float * sampled_probs = llama_get_sampled_probs_ith(ctx, idx);
+        const float * sampled_logits = llama_get_sampled_logits_ith(ctx, idx);
+        const llama_token * sampled_ids = llama_get_sampled_token_ids_ith(ctx, idx);
 
         const llama_model * model = llama_get_model(ctx);
         const llama_vocab * vocab = llama_model_get_vocab(model);
 
         const int n_vocab = llama_vocab_n_tokens(vocab);
 
-        cur.resize(n_vocab);
+        // Use the member variable instead of allocating locally
+        cur.clear();
 
-        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-            cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
+        if (sampled_probs) {
+            const uint32_t sampled_probs_count = llama_get_sampled_probs_count_ith(ctx, idx);
+            cur.reserve(sampled_probs_count);
+            // The GPU sampler has filtered the probabilities so we need to use the sampled ids.
+            if (sampled_ids != nullptr) {
+                for (uint32_t i = 0; i < sampled_probs_count; ++i) {
+                    cur.emplace_back(llama_token_data{sampled_ids[i], 0.0f, sampled_probs[i]});
+                }
+            } else {
+                for (llama_token token_id = 0; token_id < (int) sampled_probs_count; token_id++) {
+                    cur.emplace_back(llama_token_data{token_id, 0.0f, sampled_probs[token_id]});
+                }
+            }
+        } else if (sampled_logits) {
+            const uint32_t sampled_logits_count = llama_get_sampled_logits_count_ith(ctx, idx);
+            cur.reserve(sampled_logits_count);
+            // The GPU sampler has filtered the logits so we need to use the sampled ids.
+            if (sampled_ids != nullptr) {
+                for (llama_token i = 0; i < (int)sampled_logits_count; i++) {
+                    cur.emplace_back(llama_token_data{sampled_ids[i], sampled_logits[i], 0.0f});
+                }
+            } else {
+                for (llama_token token_id = 0; token_id < (int)sampled_logits_count; token_id++) {
+                    cur.emplace_back(llama_token_data{token_id, sampled_logits[token_id], 0.0f});
+                }
+            }
+        } else {
+            const auto * logits = llama_get_logits_ith(ctx, idx);
+            GGML_ASSERT(logits != nullptr);
+            cur.reserve(n_vocab);
+            for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+                cur.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
+            }
         }
 
         cur_p = { cur.data(), cur.size(), -1, false };
@@ -287,6 +321,42 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
     return result;
 }
 
+struct llama_sampler * common_sampler_gpu_init(const struct llama_model * model, const struct common_params_sampling & params) {
+    GGML_UNUSED(model);
+
+    llama_sampler_chain_params chain_params = llama_sampler_chain_default_params();
+    chain_params.no_perf = params.no_perf;
+
+    struct llama_sampler * chain = llama_sampler_chain_init(chain_params);
+
+    if (!params.gpu_sampling) {
+        return chain; // return empty chain
+    }
+
+    if (params.gpu_temp > 0.0f) {
+        llama_sampler_chain_add(chain, llama_sampler_gpu_init_temp(params.gpu_temp));
+    }
+
+    if (params.gpu_top_k > 0) {
+        llama_sampler_chain_add(chain, llama_sampler_gpu_init_top_k(params.gpu_top_k));
+    }
+
+    // TODO: GPU top_p is an approximation using top_k at the moment
+    if (params.gpu_top_p_approx_k > 0) {
+        llama_sampler_chain_add(chain, llama_sampler_gpu_init_top_p(params.gpu_top_p_approx_k));
+    }
+
+    if (params.gpu_softmax) {
+        llama_sampler_chain_add(chain, llama_sampler_gpu_init_softmax());
+    }
+
+    if (params.gpu_dist) {
+        llama_sampler_chain_add(chain, llama_sampler_gpu_init_dist(params.seed));
+    }
+
+    return chain;
+}
+
 void common_sampler_free(struct common_sampler * gsmpl) {
     if (gsmpl) {
         llama_sampler_free(gsmpl->grmr);
@@ -337,6 +407,13 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sam
 }
 
 llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
+    // Check if a GPU sampler has already sampled a token in which case we
+    // return that token id directly.
+    const llama_token gpu_sampled_token = llama_get_sampled_token_ith(ctx, idx);
+    if (gpu_sampled_token != LLAMA_TOKEN_NULL) {
+        return gpu_sampled_token;
+    }
+
    gsmpl->set_logits(ctx, idx);
 
    auto & grmr = gsmpl->grmr;
```
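With these changes the caller-side loop does not change: `common_sampler_sample()` returns the GPU-sampled token directly when `--gpu-dist` produced one, and otherwise `set_logits()` assembles the candidate list from whatever the GPU chain left behind (GPU probabilities first, then GPU-filtered logits, then the full logits as before). A minimal sketch of the unchanged call site, assuming `ctx`, `gsmpl`, and `batch` are already set up:

```cpp
// Sketch: decode-and-sample step; the GPU/CPU split is hidden behind
// common_sampler_sample(). Error handling omitted.
if (llama_decode(ctx, batch) == 0) {
    // Returns the GPU-sampled token when the chain ended with --gpu-dist,
    // otherwise runs the CPU samplers on the (possibly GPU-filtered) candidates.
    const llama_token id = common_sampler_sample(gsmpl, ctx, /*idx =*/ -1);

    // Accept the token so penalties and grammar state stay in sync.
    common_sampler_accept(gsmpl, id, /*accept_grammar =*/ true);
}
```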

common/sampling.h

Lines changed: 7 additions & 0 deletions
```diff
@@ -38,6 +38,13 @@ struct common_sampler;
 
 struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_params_sampling & params);
 
+// Create a GPU sampler chain from common sampling parameters
+// Returns a llama_sampler chain configured with GPU samplers based on the parameters
+// This chain can be used per-sequence for GPU-based sampling
+// Note: Only samplers that have GPU equivalents will be added to the chain
+// The returned sampler should be freed with llama_sampler_free()
+struct llama_sampler * common_sampler_gpu_init(const struct llama_model * model, const struct common_params_sampling & params);
+
 void common_sampler_free(struct common_sampler * gsmpl);
 
 // if accept_grammar is true, the token is accepted both by the sampling chain and the grammar
```
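A short lifecycle sketch matching the contract described in the comments above (build one chain per sequence, free it with `llama_sampler_free()`); the `tools/main/main.cpp` hunk below shows the same pattern in context:

```cpp
// Sketch: per-sequence GPU chain lifecycle. `model` and `sparams`
// (a common_params_sampling with the gpu_* fields set) are assumed to exist.
llama_sampler * gpu_chain = common_sampler_gpu_init(model, sparams);

// ... hand the chain to the context (see tools/main/main.cpp below) and decode ...

// A GPU chain is a regular llama_sampler, so it is released the usual way.
llama_sampler_free(gpu_chain);
```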

include/llama.h

Lines changed: 24 additions & 0 deletions
```diff
@@ -1367,6 +1367,30 @@ extern "C" {
     //
     LLAMA_API struct llama_sampler * llama_sampler_init_infill(const struct llama_vocab * vocab);
 
+    //
+    // GPU samplers
+    //
+
+    /// @details Greedy sampling on GPU - always selects the token with the highest probability
+    LLAMA_API struct llama_sampler * llama_sampler_gpu_init_greedy(void);
+
+    /// @details Temperature scaling on GPU - scales logits by 1/temperature
+    LLAMA_API struct llama_sampler * llama_sampler_gpu_init_temp(float temp);
+
+    /// @details Softmax normalization on GPU - converts logits to probabilities
+    LLAMA_API struct llama_sampler * llama_sampler_gpu_init_softmax(void);
+
+    /// @details Top-K filtering on GPU - keeps only the k tokens with highest probabilities
+    LLAMA_API struct llama_sampler * llama_sampler_gpu_init_top_k(int32_t k);
+
+    /// @details Top-P approximation on GPU using top-k (not true top-p sampling)
+    /// This is an approximation that uses top-k to simulate top-p behavior
+    /// TODO: implement true top-p sampling on GPU
+    LLAMA_API struct llama_sampler * llama_sampler_gpu_init_top_p(int32_t k);
+
+    /// @details Distribution sampling on GPU - final sampling step that selects a token
+    LLAMA_API struct llama_sampler * llama_sampler_gpu_init_dist(uint32_t seed);
+
     // Returns the seed used by the sampler if applicable, LLAMA_DEFAULT_SEED otherwise
     LLAMA_API uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl);
```
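The same chain can also be assembled directly against the public API, without going through `common`. A minimal sketch using the existing `llama_sampler_chain_*` helpers plus the new GPU initializers declared above (roughly what `common_sampler_gpu_init()` does for `--gpu-temp 0.8 --gpu-top-k 20 --gpu-dist`); the specific values are illustrative:

```cpp
#include "llama.h"

// Sketch: hand-built GPU sampler chain, equivalent to
// --gpu-sampling --gpu-temp 0.8 --gpu-top-k 20 --gpu-dist.
static llama_sampler * build_gpu_chain(void) {
    llama_sampler_chain_params cparams = llama_sampler_chain_default_params();

    llama_sampler * chain = llama_sampler_chain_init(cparams);
    llama_sampler_chain_add(chain, llama_sampler_gpu_init_temp(0.8f));
    llama_sampler_chain_add(chain, llama_sampler_gpu_init_top_k(20));
    llama_sampler_chain_add(chain, llama_sampler_gpu_init_dist(LLAMA_DEFAULT_SEED));

    return chain; // free with llama_sampler_free() when no longer needed
}
```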

tools/main/main.cpp

Lines changed: 7 additions & 0 deletions
```diff
@@ -135,6 +135,13 @@ int main(int argc, char ** argv) {
 
     std::vector<common_chat_msg> chat_msgs;
 
+    // Configure GPU Sampler
+    std::vector<llama_sampler_seq_config> gpu_samplers = {
+        { 0, common_sampler_gpu_init(model, sparams) }
+    };
+    params.gpu_samplers = gpu_samplers.data();
+    params.n_gpu_samplers = gpu_samplers.size();
+
     // load the model and apply lora adapter, if any
     LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__);
     common_init_result llama_init = common_init_from_params(params);
```
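Two details follow from this hunk: the GPU sampler is wired up before `common_init_from_params()`, because that call (see the `common/common.cpp` hunk) copies `params.gpu_samplers` into the context params, and the `gpu_samplers` vector therefore has to stay alive at least until the context is created. The hunk attaches a single chain to sequence 0; a multi-sequence variant would look roughly like the sketch below. The `{ seq_id, sampler }` layout of `llama_sampler_seq_config` is inferred from the initializer above, and `n_parallel` is a hypothetical sequence count, so treat both as assumptions:

```cpp
// Sketch: one GPU chain per sequence for parallel decoding.
// Assumption: llama_sampler_seq_config is an aggregate of { seq_id, sampler },
// as the single-sequence initializer above suggests. n_parallel is hypothetical.
std::vector<llama_sampler_seq_config> gpu_samplers;
for (int32_t seq = 0; seq < n_parallel; ++seq) {
    gpu_samplers.push_back({ seq, common_sampler_gpu_init(model, sparams) });
}
params.gpu_samplers   = gpu_samplers.data();
params.n_gpu_samplers = gpu_samplers.size();
// gpu_samplers must outlive the common_init_from_params(params) call.
```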
