@@ -2482,6 +2482,7 @@ struct llama_cparams {
     bool causal_attn;
     bool offload_kqv;
     bool flash_attn;
+    bool no_perf;

     enum llama_pooling_type pooling_type;
@@ -6647,8 +6648,6 @@ static bool llm_load_tensors(
         bool use_mlock,
         llama_progress_callback progress_callback,
         void * progress_callback_user_data) {
-    model.t_start_us = ggml_time_us();
-
     auto & hparams = model.hparams;

     model.split_mode = split_mode;
@@ -8579,14 +8578,13 @@ static bool llm_load_tensors(
         }
     }

-    // loading time will be recalculated after the first eval, so
-    // we take page faults deferred by mmap() into consideration
-    model.t_load_us = ggml_time_us() - model.t_start_us;
     return true;
 }

 // Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
 static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) {
+    model.t_start_us = ggml_time_us();
+
     try {
         llama_model_loader ml(fname, params.use_mmap, params.check_tensors, params.kv_overrides);
@@ -8648,6 +8646,10 @@ static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) {
         return -1;
     }

+    // loading time will be recalculated after the first eval, so
+    // we take page faults deferred by mmap() into consideration
+    model.t_load_us = ggml_time_us() - model.t_start_us;
+
     return 0;
 }
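The hunks above move the load timer: t_start_us is now taken at the top of llama_model_load, so t_load_us covers the whole load (loader construction, vocab, tensors) instead of just llm_load_tensors. A rough external cross-check, as a sketch; the model path and the llama_backend_init/llama_load_model_from_file calls are illustrative and not part of this diff:

    #include <cstdint>
    #include <cstdio>
    #include "ggml.h"   // ggml_time_us
    #include "llama.h"

    int main() {
        llama_backend_init();

        const int64_t t0 = ggml_time_us();
        llama_model * model = llama_load_model_from_file("model.gguf", llama_model_default_params());
        const int64_t t1 = ggml_time_us();

        // should roughly agree with the "load time" line that
        // llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT) reports later
        printf("external load time: %.2f ms\n", 1e-3 * (t1 - t0));

        llama_free_model(model);
        llama_backend_free();
        return 0;
    }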
@@ -17915,6 +17917,7 @@ struct llama_context_params llama_context_default_params() {
         /*.embeddings =*/ false,
         /*.offload_kqv =*/ true,
         /*.flash_attn =*/ false,
+        /*.no_perf =*/ true,
         /*.abort_callback =*/ nullptr,
         /*.abort_callback_data =*/ nullptr,
     };
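Note the default: /*.no_perf =*/ true, so the timing counters are now off unless the caller opts back in. A minimal sketch of opting in; only no_perf, llama_context_default_params, llama_new_context_with_model, and llama_perf_print come from this diff, the loading calls are illustrative:

    #include "llama.h"

    int main() {
        llama_backend_init();

        llama_model * model = llama_load_model_from_file("model.gguf", llama_model_default_params());

        llama_context_params cparams = llama_context_default_params();
        cparams.no_perf = false;  // re-enable t_eval_us / t_p_eval_us accumulation

        llama_context * ctx = llama_new_context_with_model(model, cparams);

        // ... tokenize, llama_decode, sample ...

        llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);

        llama_free(ctx);
        llama_free_model(model);
        llama_backend_free();
        return 0;
    }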
@@ -18125,6 +18128,7 @@ struct llama_context * llama_new_context_with_model(
     cparams.embeddings = params.embeddings;
     cparams.offload_kqv = params.offload_kqv;
     cparams.flash_attn = params.flash_attn;
+    cparams.no_perf = params.no_perf;
     cparams.pooling_type = params.pooling_type;

     cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
@@ -20043,10 +20047,14 @@ void llama_synchronize(struct llama_context * ctx) {

     // add the evaluation to the stats
     if (ctx->n_queued_tokens == 1) {
-        ctx->t_eval_us += ggml_time_us() - ctx->t_compute_start_us;
+        if (!ctx->cparams.no_perf) {
+            ctx->t_eval_us += ggml_time_us() - ctx->t_compute_start_us;
+        }
         ctx->n_eval++;
     } else if (ctx->n_queued_tokens > 1) {
-        ctx->t_p_eval_us += ggml_time_us() - ctx->t_compute_start_us;
+        if (!ctx->cparams.no_perf) {
+            ctx->t_p_eval_us += ggml_time_us() - ctx->t_compute_start_us;
+        }
         ctx->n_p_eval += ctx->n_queued_tokens;
     }
@@ -20653,39 +20661,61 @@ const char * llama_print_system_info(void) {
     return s.c_str();
 }

-void llama_perf_print(const void * ctx, enum llama_perf_type type) {
+llama_perf_data llama_perf_get(const void * ctx, enum llama_perf_type type) {
+    llama_perf_data data = {};
+
+    if (ctx == nullptr) {
+        return data;
+    }
+
     switch (type) {
         case LLAMA_PERF_TYPE_CONTEXT:
             {
                 const auto * p = (const struct llama_context *) ctx;

-                const double t_start_ms = 1e-3 * p->t_start_us;
-                const double t_end_ms = 1.00 * ggml_time_ms();
-                const double t_load_ms = 1e-3 * p->t_load_us;
-                const double t_p_eval_ms = 1e-3 * p->t_p_eval_us;
-                const double t_eval_ms = 1e-3 * p->t_eval_us;
+                data.t_start_ms = 1e-3 * p->t_start_us;
+                data.t_load_ms = 1e-3 * p->t_load_us;
+                data.t_p_eval_ms = 1e-3 * p->t_p_eval_us;
+                data.t_eval_ms = 1e-3 * p->t_eval_us;
+                data.n_p_eval = std::max(1, p->n_p_eval);
+                data.n_eval = std::max(1, p->n_eval);
+            } break;
+        case LLAMA_PERF_TYPE_SAMPLER_CHAIN:
+            {
+                const auto * smpl = (const struct llama_sampler *) ctx;
+                const auto * p = (const struct llama_sampler_chain *) smpl->ctx;

-                const int32_t n_p_eval = std::max(0, p->n_p_eval);
-                const int32_t n_eval = std::max(1, p->n_eval);
+                data.t_sample_ms = 1e-3 * p->t_sample_us;
+                data.n_sample = std::max(0, p->n_sample);
+            } break;
+        default:
+            GGML_ABORT("invalid perf type");
+    }
+
+    return data;
+}

-                LLAMA_LOG_INFO("%s: load time = %10.2f ms\n", __func__, t_load_ms);
+void llama_perf_print(const void * ctx, enum llama_perf_type type) {
+    switch (type) {
+        case LLAMA_PERF_TYPE_CONTEXT:
+            {
+                const auto data = llama_perf_get(ctx, type);
+
+                const double t_end_ms = 1e-3 * ggml_time_us();
+
+                LLAMA_LOG_INFO("%s: load time = %10.2f ms\n", __func__, data.t_load_ms);
                 LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
-                        __func__, t_p_eval_ms, n_p_eval, t_p_eval_ms / n_p_eval, 1e3 / t_p_eval_ms * n_p_eval);
+                        __func__, data.t_p_eval_ms, data.n_p_eval, data.t_p_eval_ms / data.n_p_eval, 1e3 / data.t_p_eval_ms * data.n_p_eval);
                 LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
-                        __func__, t_eval_ms, n_eval, t_eval_ms / n_eval, 1e3 / t_eval_ms * n_eval);
-                LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - t_start_ms), (n_p_eval + n_eval));
+                        __func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval);
+                LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval));
             } break;
         case LLAMA_PERF_TYPE_SAMPLER_CHAIN:
             {
-                const auto * smpl = (const struct llama_sampler *) ctx;
-                const auto * p = (const struct llama_sampler_chain *) smpl->ctx;
-
-                const double t_sampler_ms = 1e-3 * p->t_sample_us;
-
-                const int32_t n_sampler = std::max(0, p->n_sample);
+                const auto data = llama_perf_get(ctx, type);

                 LLAMA_LOG_INFO("%s: sampling time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
-                        __func__, t_sampler_ms, n_sampler, t_sampler_ms / n_sampler, 1e3 / t_sampler_ms * n_sampler);
+                        __func__, data.t_sample_ms, data.n_sample, data.t_sample_ms / data.n_sample, 1e3 / data.t_sample_ms * data.n_sample);
             } break;
         default:
             GGML_ABORT("invalid perf type");
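llama_perf_get exposes the counters programmatically instead of only through the log. One consequence of the llama_synchronize hunk above is worth noting: with no_perf at its default, n_eval/n_p_eval still advance while the timings stay at zero, so consumers should guard the divisions. A sketch, assuming llama_perf_data carries the fields assigned above (double millisecond timings, int32_t counts):

    #include <cstdio>
    #include "llama.h"

    // export decode throughput, e.g. to an external metrics system
    static void report_throughput(const struct llama_context * ctx) {
        const llama_perf_data data = llama_perf_get(ctx, LLAMA_PERF_TYPE_CONTEXT);

        // n_p_eval / n_eval are clamped to >= 1 by llama_perf_get, but the
        // timings remain 0 when cparams.no_perf is set, so guard the division
        if (data.t_p_eval_ms > 0.0) {
            printf("prompt: %8.2f tokens/s\n", 1e3 / data.t_p_eval_ms * data.n_p_eval);
        }
        if (data.t_eval_ms > 0.0) {
            printf("eval:   %8.2f tokens/s\n", 1e3 / data.t_eval_ms * data.n_eval);
        }
    }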
@@ -20705,7 +20735,7 @@ void llama_perf_reset(void * ctx, enum llama_perf_type type) {
         case LLAMA_PERF_TYPE_SAMPLER_CHAIN:
             {
                 auto * smpl = (struct llama_sampler *) ctx;
-                auto * p = (struct llama_sampler_chain *) smpl->ctx;
+                auto * p = (struct llama_sampler_chain *) smpl->ctx;

                 p->t_sample_us = p->n_sample = 0;
             } break;
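The same two entry points work per sampler chain. A sketch; the chain construction (llama_sampler_chain_init, llama_sampler_chain_add, llama_sampler_init_greedy, llama_sampler_free) is assumed from the sampling API of the same period and is not part of this diff:

    #include "llama.h"

    int main() {
        struct llama_sampler * smpl = llama_sampler_chain_init(llama_sampler_chain_default_params());
        llama_sampler_chain_add(smpl, llama_sampler_init_greedy());

        // ... sample tokens with smpl ...

        llama_perf_print(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN);  // sampling time / runs
        llama_perf_reset(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN);  // zeroes t_sample_us and n_sample

        llama_sampler_free(smpl);
        return 0;
    }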