Commit c0ac70c

llama-cli : add initial GPU sampling support
This commit adds initial support for GPU sampling in llama-cli.

Options:
```console
$ ./build/bin/llama-cli --help
----- sampling params -----
...
--gpu-sampling                enable GPU sampling (default: disabled)
--gpu-top-k N                 GPU top-k sampling (default: 40, <= 0 = disabled)
--gpu-top-p-approx-k N        GPU top-p approximation using top-k (default: 0, 0 = disabled)
--gpu-temp N                  GPU temperature (default: 0.80, 0.0 = disabled, greedy sampling)
--gpu-softmax                 add GPU softmax to sampling chain (default: disabled)
--gpu-dist                    add GPU dist (final sampling) to sampling chain (default: disabled)
```

Usage:
```console
$ ./build/bin/llama-cli -m models/Qwen2.5-VL-3B-Instruct-Q8_0.gguf \
    -p "What is the Capital of Sweden?" \
    --gpu-sampling \
    --gpu-temp 0 \
    --gpu-top-k 20 \
    --gpu-dist \
    -ngl 99 \
    -no-cnv \
    -n 20 \
    --no-warmup
```
1 parent: 0bf740e

7 files changed: +176 additions, -4 deletions


common/arg.cpp

Lines changed: 43 additions & 0 deletions
```diff
@@ -2477,6 +2477,49 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.sampling.grammar = json_schema_to_grammar(json::parse(schema));
         }
     ).set_sparam());
+    add_opt(common_arg(
+        {"--gpu-sampling"},
+        "enable GPU sampling (default: disabled)",
+        [](common_params & params) {
+            params.sampling.gpu_sampling = true;
+        }
+    ).set_sparam());
+    add_opt(common_arg(
+        {"--gpu-top-k"}, "N",
+        string_format("GPU top-k sampling (default: %d, <= 0 = disabled)", params.sampling.gpu_top_k),
+        [](common_params & params, int value) {
+            params.sampling.gpu_top_k = value;
+        }
+    ).set_sparam());
+    add_opt(common_arg(
+        {"--gpu-top-p-approx-k"}, "N",
+        string_format("GPU top-p approximation using top-k (default: %d, 0 = disabled)", params.sampling.gpu_top_p_approx_k),
+        [](common_params & params, int value) {
+            params.sampling.gpu_top_p_approx_k = value;
+        }
+    ).set_sparam());
+    add_opt(common_arg(
+        {"--gpu-temp"}, "N",
+        string_format("GPU temperature (default: %.2f, 0.0 = disabled, greedy sampling)", (double)params.sampling.gpu_temp),
+        [](common_params & params, const std::string & value) {
+            params.sampling.gpu_temp = std::stof(value);
+            params.sampling.gpu_temp = std::max(params.sampling.gpu_temp, 0.0f);
+        }
+    ).set_sparam());
+    add_opt(common_arg(
+        {"--gpu-softmax"},
+        "add GPU softmax to sampling chain (default: disabled)",
+        [](common_params & params) {
+            params.sampling.gpu_softmax = true;
+        }
+    ).set_sparam());
+    add_opt(common_arg(
+        {"--gpu-dist"},
+        "add GPU dist (final sampling) to sampling chain (default: disabled)",
+        [](common_params & params) {
+            params.sampling.gpu_dist = true;
+        }
+    ).set_sparam());
     add_opt(common_arg(
         {"--pooling"}, "{none,mean,cls,last,rank}",
         "pooling type for embeddings, use model default if unspecified",
```

common/common.cpp

Lines changed: 3 additions & 0 deletions
```diff
@@ -8,6 +8,7 @@
 #include "common.h"
 #include "log.h"
 #include "llama.h"
+#include "sampling.h"
 
 #include <algorithm>
 #include <cinttypes>
@@ -927,6 +928,8 @@ struct common_init_result common_init_from_params(common_params & params) {
     const llama_vocab * vocab = llama_model_get_vocab(model);
 
     auto cparams = common_context_params_to_llama(params);
+    cparams.samplers = params.gpu_samplers;
+    cparams.n_samplers = params.n_gpu_samplers;
 
     llama_context * lctx = llama_init_from_model(model, cparams);
     if (lctx == NULL) {
```

common/common.h

Lines changed: 11 additions & 0 deletions
```diff
@@ -188,6 +188,14 @@ struct common_params_sampling {
     std::vector<llama_logit_bias> logit_bias;     // logit biases to apply
     std::vector<llama_logit_bias> logit_bias_eog; // pre-calculated logit biases for EOG tokens
 
+    // GPU sampling parameters
+    bool gpu_sampling = false;      // enable GPU sampling
+    int32_t gpu_top_k = 40;         // GPU top-k (<= 0 to disable)
+    int32_t gpu_top_p_approx_k = 0; // GPU top-p approximation using top-k (0 = disabled)
+    float gpu_temp = 0.80f;         // GPU temperature (0.0 = disabled, greedy sampling)
+    bool gpu_softmax = false;       // add GPU softmax to chain
+    bool gpu_dist = false;          // add GPU dist (final sampling) to chain
+
     // print the parameters into a string
     std::string print() const;
 };
@@ -511,6 +519,9 @@ struct common_params {
     bool has_speculative() const {
         return !speculative.model.path.empty() || !speculative.model.hf_repo.empty();
     }
+
+    struct llama_sampler_seq_config * gpu_samplers;
+    size_t n_gpu_samplers;
 };
 
 // call once at the start of a program if it uses libcommon
```
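The new fields mirror the `--gpu-*` flags one-to-one, so GPU sampling can also be configured in code rather than via the CLI. A minimal sketch, assuming a loaded `llama_model * model` and the `common_sampler_gpu_init()` helper added in `common/sampling.cpp` below; the hard-coded values are just for illustration:

```cpp
#include "common.h"
#include "sampling.h"

// Sketch: programmatic equivalent of --gpu-sampling --gpu-temp 0.8 --gpu-top-k 20 --gpu-dist.
// `model` is assumed to be a loaded llama_model *; error handling omitted.
static llama_sampler * make_gpu_chain(const llama_model * model) {
    common_params_sampling sparams;

    sparams.gpu_sampling = true;  // master switch, same as --gpu-sampling
    sparams.gpu_temp     = 0.8f;  // temperature scaling on the GPU
    sparams.gpu_top_k    = 20;    // keep only the 20 most likely tokens
    sparams.gpu_dist     = true;  // let the GPU pick the final token

    // Returns an empty chain when gpu_sampling is false, otherwise a chain
    // containing only the GPU samplers enabled above.
    return common_sampler_gpu_init(model, sparams);
}
```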

common/sampling.cpp

Lines changed: 81 additions & 4 deletions
```diff
@@ -113,17 +113,51 @@ struct common_sampler {
     llama_token_data_array cur_p;
 
     void set_logits(struct llama_context * ctx, int idx) {
-        const auto * logits = llama_get_logits_ith(ctx, idx);
+        const float * sampled_probs = llama_get_sampled_probs_ith(ctx, idx);
+        const float * sampled_logits = llama_get_sampled_logits_ith(ctx, idx);
+        const llama_token * sampled_ids = llama_get_sampled_token_ids_ith(ctx, idx);
 
         const llama_model * model = llama_get_model(ctx);
         const llama_vocab * vocab = llama_model_get_vocab(model);
 
         const int n_vocab = llama_vocab_n_tokens(vocab);
 
-        cur.resize(n_vocab);
+        // Use the member variable instead of allocating locally
+        cur.clear();
 
-        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-            cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
+        if (sampled_probs) {
+            const uint32_t sampled_probs_count = llama_get_sampled_probs_count_ith(ctx, idx);
+            cur.reserve(sampled_probs_count);
+            // The GPU sampler has filtered the probabilities so we need to use the sampled ids.
+            if (sampled_ids != nullptr) {
+                for (uint32_t i = 0; i < sampled_probs_count; ++i) {
+                    cur.emplace_back(llama_token_data{sampled_ids[i], 0.0f, sampled_probs[i]});
+                }
+            } else {
+                for (llama_token token_id = 0; token_id < (int) sampled_probs_count; token_id++) {
+                    cur.emplace_back(llama_token_data{token_id, 0.0f, sampled_probs[token_id]});
+                }
+            }
+        } else if (sampled_logits) {
+            const uint32_t sampled_logits_count = llama_get_sampled_logits_count_ith(ctx, idx);
+            cur.reserve(sampled_logits_count);
+            // The GPU sampler has filtered the logits so we need to use the sampled ids.
+            if (sampled_ids != nullptr) {
+                for (llama_token i = 0; i < (int)sampled_logits_count; i++) {
+                    cur.emplace_back(llama_token_data{sampled_ids[i], sampled_logits[i], 0.0f});
+                }
+            } else {
+                for (llama_token token_id = 0; token_id < (int)sampled_logits_count; token_id++) {
+                    cur.emplace_back(llama_token_data{token_id, sampled_logits[token_id], 0.0f});
+                }
+            }
+        } else {
+            const auto * logits = llama_get_logits_ith(ctx, idx);
+            GGML_ASSERT(logits != nullptr);
+            cur.reserve(n_vocab);
+            for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+                cur.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
+            }
         }
 
         cur_p = { cur.data(), cur.size(), -1, false };
@@ -287,6 +321,42 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
     return result;
 }
 
+struct llama_sampler * common_sampler_gpu_init(const struct llama_model * model, const struct common_params_sampling & params) {
+    GGML_UNUSED(model);
+
+    llama_sampler_chain_params chain_params = llama_sampler_chain_default_params();
+    chain_params.no_perf = params.no_perf;
+
+    struct llama_sampler * chain = llama_sampler_chain_init(chain_params);
+
+    if (!params.gpu_sampling) {
+        return chain; // return empty chain
+    }
+
+    if (params.gpu_temp > 0.0f) {
+        llama_sampler_chain_add(chain, llama_sampler_gpu_init_temp(params.gpu_temp));
+    }
+
+    if (params.gpu_top_k > 0) {
+        llama_sampler_chain_add(chain, llama_sampler_gpu_init_top_k(params.gpu_top_k));
+    }
+
+    // TODO: GPU top_p is an approximation using top_k at the moment
+    if (params.gpu_top_p_approx_k > 0) {
+        llama_sampler_chain_add(chain, llama_sampler_gpu_init_top_p(params.gpu_top_p_approx_k));
+    }
+
+    if (params.gpu_softmax) {
+        llama_sampler_chain_add(chain, llama_sampler_gpu_init_softmax());
+    }
+
+    if (params.gpu_dist) {
+        llama_sampler_chain_add(chain, llama_sampler_gpu_init_dist(params.seed));
+    }
+
+    return chain;
+}
+
 void common_sampler_free(struct common_sampler * gsmpl) {
     if (gsmpl) {
         llama_sampler_free(gsmpl->grmr);
@@ -337,6 +407,13 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sam
 }
 
 llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
+    // Check if a GPU sampler has already sampled a token in which case we
+    // return that token id directly.
+    const llama_token gpu_sampled_token = llama_get_sampled_token_ith(ctx, idx);
+    if (gpu_sampled_token != LLAMA_TOKEN_NULL) {
+        return gpu_sampled_token;
+    }
+
    gsmpl->set_logits(ctx, idx);
 
    auto & grmr = gsmpl->grmr;
```
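With these changes the caller-side loop does not change: `common_sampler_sample()` returns the GPU-sampled token directly when `--gpu-dist` produced one, and otherwise `set_logits()` assembles the candidate list from whatever the GPU chain left behind (GPU probabilities first, then GPU-filtered logits, then the full logits as before). A minimal sketch of the unchanged call site, assuming `ctx`, `gsmpl`, and `batch` are already set up:

```cpp
// Sketch: decode-and-sample step; the GPU/CPU split is hidden behind
// common_sampler_sample(). Error handling omitted.
if (llama_decode(ctx, batch) == 0) {
    // Returns the GPU-sampled token when the chain ended with --gpu-dist,
    // otherwise runs the CPU samplers on the (possibly GPU-filtered) candidates.
    const llama_token id = common_sampler_sample(gsmpl, ctx, /*idx =*/ -1);

    // Accept the token so penalties and grammar state stay in sync.
    common_sampler_accept(gsmpl, id, /*accept_grammar =*/ true);
}
```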

common/sampling.h

Lines changed: 7 additions & 0 deletions
```diff
@@ -38,6 +38,13 @@ struct common_sampler;
 
 struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_params_sampling & params);
 
+// Create a GPU sampler chain from common sampling parameters
+// Returns a llama_sampler chain configured with GPU samplers based on the parameters
+// This chain can be used per-sequence for GPU-based sampling
+// Note: Only samplers that have GPU equivalents will be added to the chain
+// The returned sampler should be freed with llama_sampler_free()
+struct llama_sampler * common_sampler_gpu_init(const struct llama_model * model, const struct common_params_sampling & params);
+
 void common_sampler_free(struct common_sampler * gsmpl);
 
 // if accept_grammar is true, the token is accepted both by the sampling chain and the grammar
```
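A short lifecycle sketch matching the contract described in the comments above (build one chain per sequence, free it with `llama_sampler_free()`); the `tools/main/main.cpp` hunk below shows the same pattern in context:

```cpp
// Sketch: per-sequence GPU chain lifecycle. `model` and `sparams`
// (a common_params_sampling with the gpu_* fields set) are assumed to exist.
llama_sampler * gpu_chain = common_sampler_gpu_init(model, sparams);

// ... hand the chain to the context (see tools/main/main.cpp below) and decode ...

// A GPU chain is a regular llama_sampler, so it is released the usual way.
llama_sampler_free(gpu_chain);
```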

include/llama.h

Lines changed: 24 additions & 0 deletions
```diff
@@ -1367,6 +1367,30 @@ extern "C" {
     //
     LLAMA_API struct llama_sampler * llama_sampler_init_infill(const struct llama_vocab * vocab);
 
+    //
+    // GPU samplers
+    //
+
+    /// @details Greedy sampling on GPU - always selects the token with the highest probability
+    LLAMA_API struct llama_sampler * llama_sampler_gpu_init_greedy(void);
+
+    /// @details Temperature scaling on GPU - scales logits by 1/temperature
+    LLAMA_API struct llama_sampler * llama_sampler_gpu_init_temp(float temp);
+
+    /// @details Softmax normalization on GPU - converts logits to probabilities
+    LLAMA_API struct llama_sampler * llama_sampler_gpu_init_softmax(void);
+
+    /// @details Top-K filtering on GPU - keeps only the k tokens with highest probabilities
+    LLAMA_API struct llama_sampler * llama_sampler_gpu_init_top_k(int32_t k);
+
+    /// @details Top-P approximation on GPU using top-k (not true top-p sampling)
+    /// This is an approximation that uses top-k to simulate top-p behavior
+    /// TODO: implement true top-p sampling on GPU
+    LLAMA_API struct llama_sampler * llama_sampler_gpu_init_top_p(int32_t k);
+
+    /// @details Distribution sampling on GPU - final sampling step that selects a token
+    LLAMA_API struct llama_sampler * llama_sampler_gpu_init_dist(uint32_t seed);
+
     // Returns the seed used by the sampler if applicable, LLAMA_DEFAULT_SEED otherwise
     LLAMA_API uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl);
```
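The same chain can also be assembled directly against the public API, without going through `common`. A minimal sketch using the existing `llama_sampler_chain_*` helpers plus the new GPU initializers declared above (roughly what `common_sampler_gpu_init()` does for `--gpu-temp 0.8 --gpu-top-k 20 --gpu-dist`); the specific values are illustrative:

```cpp
#include "llama.h"

// Sketch: hand-built GPU sampler chain, equivalent to
// --gpu-sampling --gpu-temp 0.8 --gpu-top-k 20 --gpu-dist.
static llama_sampler * build_gpu_chain(void) {
    llama_sampler_chain_params cparams = llama_sampler_chain_default_params();

    llama_sampler * chain = llama_sampler_chain_init(cparams);
    llama_sampler_chain_add(chain, llama_sampler_gpu_init_temp(0.8f));
    llama_sampler_chain_add(chain, llama_sampler_gpu_init_top_k(20));
    llama_sampler_chain_add(chain, llama_sampler_gpu_init_dist(LLAMA_DEFAULT_SEED));

    return chain; // free with llama_sampler_free() when no longer needed
}
```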

tools/main/main.cpp

Lines changed: 7 additions & 0 deletions
```diff
@@ -135,6 +135,13 @@ int main(int argc, char ** argv) {
 
     std::vector<common_chat_msg> chat_msgs;
 
+    // Configure GPU Sampler
+    std::vector<llama_sampler_seq_config> gpu_samplers = {
+        { 0, common_sampler_gpu_init(model, sparams) }
+    };
+    params.gpu_samplers = gpu_samplers.data();
+    params.n_gpu_samplers = gpu_samplers.size();
+
     // load the model and apply lora adapter, if any
     LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__);
     common_init_result llama_init = common_init_from_params(params);
```
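Two details follow from this hunk: the GPU sampler is wired up before `common_init_from_params()`, because that call (see the `common/common.cpp` hunk) copies `params.gpu_samplers` into the context params, and the `gpu_samplers` vector therefore has to stay alive at least until the context is created. The hunk attaches a single chain to sequence 0; a multi-sequence variant would look roughly like the sketch below. The `{ seq_id, sampler }` layout of `llama_sampler_seq_config` is inferred from the initializer above, and `n_parallel` is a hypothetical sequence count, so treat both as assumptions:

```cpp
// Sketch: one GPU chain per sequence for parallel decoding.
// Assumption: llama_sampler_seq_config is an aggregate of { seq_id, sampler },
// as the single-sequence initializer above suggests. n_parallel is hypothetical.
std::vector<llama_sampler_seq_config> gpu_samplers;
for (int32_t seq = 0; seq < n_parallel; ++seq) {
    gpu_samplers.push_back({ seq, common_sampler_gpu_init(model, sparams) });
}
params.gpu_samplers   = gpu_samplers.data();
params.n_gpu_samplers = gpu_samplers.size();
// gpu_samplers must outlive the common_init_from_params(params) call.
```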
