Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit c500461

Browse files
localai-bot, mudler, claude
authored
feat(config): default prompt_cache_all to true (mudler#9951)
Upstream llama.cpp defaults `cache_prompt = true` (common/common.h), but `parse_options` in the grpc-server backend unconditionally forwards the proto `PromptCacheAll` field, so any model that didn't set `prompt_cache_all: true` in its YAML was getting `cache_prompt=false` — silently overriding llama.cpp's own default.

With `kv_unified` and `cache_idle_slots` already on by default, this was the last piece preventing the per-request prompt cache from being usable out of the box.

Make `PromptCacheAll` tristate (`*bool`), default it to `true` in `SetDefaults`, and dereference at the proto boundary. Users can still opt out with an explicit `prompt_cache_all: false`. Same pattern as `MMap`, `MMlock`, `Reranking`, etc.

Co-authored-by: Ettore Di Giacinto <[email protected]>
Co-authored-by: Claude Opus 4.7 (1M context) <[email protected]>
1 parent 834ecc3 commit c500461

3 files changed

Lines changed: 41 additions & 2 deletions

File tree

core/backend/options.go

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -277,7 +277,7 @@ func gRPCPredictOpts(c config.ModelConfig, modelPath string) *pb.PredictOptions
277277
MinP: float32(*c.MinP),
278278
Tokens: int32(*c.Maxtokens),
279279
Threads: int32(*c.Threads),
280-
PromptCacheAll: c.PromptCacheAll,
280+
PromptCacheAll: *c.PromptCacheAll,
281281
PromptCacheRO: c.PromptCacheRO,
282282
PromptCachePath: promptCachePath,
283283
F16KV: *c.F16,

core/config/hooks_test.go

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,4 +136,36 @@ var _ = Describe("Backend hooks and parser defaults", func() {
136136
Expect(cfg.EngineArgs["enable_chunked_prefill"]).To(Equal(true))
137137
})
138138
})
139+
140+
Context("PromptCacheAll default", func() {
141+
It("defaults to true when omitted from YAML", func() {
142+
cfg := &ModelConfig{}
143+
cfg.SetDefaults()
144+
145+
Expect(cfg.PromptCacheAll).NotTo(BeNil())
146+
Expect(*cfg.PromptCacheAll).To(BeTrue())
147+
})
148+
149+
It("preserves an explicit false from YAML", func() {
150+
falseV := false
151+
cfg := &ModelConfig{
152+
LLMConfig: LLMConfig{PromptCacheAll: &falseV},
153+
}
154+
cfg.SetDefaults()
155+
156+
Expect(cfg.PromptCacheAll).NotTo(BeNil())
157+
Expect(*cfg.PromptCacheAll).To(BeFalse())
158+
})
159+
160+
It("preserves an explicit true from YAML", func() {
161+
trueV := true
162+
cfg := &ModelConfig{
163+
LLMConfig: LLMConfig{PromptCacheAll: &trueV},
164+
}
165+
cfg.SetDefaults()
166+
167+
Expect(cfg.PromptCacheAll).NotTo(BeNil())
168+
Expect(*cfg.PromptCacheAll).To(BeTrue())
169+
})
170+
})
139171
})

core/config/model_config.go

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -209,7 +209,7 @@ type LLMConfig struct {
209209
RMSNormEps float32 `yaml:"rms_norm_eps,omitempty" json:"rms_norm_eps,omitempty"`
210210
NGQA int32 `yaml:"ngqa,omitempty" json:"ngqa,omitempty"`
211211
PromptCachePath string `yaml:"prompt_cache_path,omitempty" json:"prompt_cache_path,omitempty"`
212-
PromptCacheAll bool `yaml:"prompt_cache_all,omitempty" json:"prompt_cache_all,omitempty"`
212+
PromptCacheAll *bool `yaml:"prompt_cache_all,omitempty" json:"prompt_cache_all,omitempty"`
213213
PromptCacheRO bool `yaml:"prompt_cache_ro,omitempty" json:"prompt_cache_ro,omitempty"`
214214
MirostatETA *float64 `yaml:"mirostat_eta,omitempty" json:"mirostat_eta,omitempty"`
215215
MirostatTAU *float64 `yaml:"mirostat_tau,omitempty" json:"mirostat_tau,omitempty"`
@@ -494,6 +494,13 @@ func (cfg *ModelConfig) SetDefaults(opts ...ConfigLoaderOption) {
494494
cfg.Reranking = &falseV
495495
}
496496

497+
if cfg.PromptCacheAll == nil {
498+
// Match upstream llama.cpp's default (common/common.h: cache_prompt = true)
499+
// and let cache_idle_slots / kv_unified actually do useful work; users can
500+
// opt out with an explicit `prompt_cache_all: false` in the model YAML.
501+
cfg.PromptCacheAll = &trueV
502+
}
503+
497504
if threads == 0 {
498505
// Threads can't be 0
499506
threads = 4

0 commit comments

Comments
 (0)