11 changes: 11 additions & 0 deletions docs/src/pages/docs/desktop/llama-cpp.mdx
@@ -202,6 +202,17 @@ These settings are for fine-tuning model behavior and advanced use cases:
| **Grammar File** | Constrain output format | Empty | For structured output (JSON, code, etc.) |
| **JSON Schema File** | Enforce JSON structure | Empty | When you need specific JSON formats |

### Mixture-of-Experts (MoE) Optimization

| Setting | What It Does | Default Value | When to Change |
|---------|-------------|---------------|----------------|
| **Keep All MoE Weights on CPU** | Offload all MoE expert weights to CPU | Disabled | For large MoE models when GPU memory is limited |
| **Keep First N MoE Layers on CPU** | Offload first N MoE layers to CPU | 0 (disabled) | Fine-tune memory usage for specific MoE models |

<Callout type="info">
These MoE settings are specifically for Mixture-of-Experts models (like Mixtral, Qwen2.5-MoE, etc.). They help reduce GPU memory usage by keeping expert weights on CPU while maintaining attention computations on GPU. Note: `--override-tensor` settings take precedence over these options.
</Callout>

## Troubleshooting Common Issues

**Models won't load:**
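For orientation, a minimal TypeScript sketch of how the two new settings are intended to map onto llama-server flags. The flag names and the precedence rule mirror the `index.ts` change further down; the helper itself is illustrative only and is not part of the diff.

```typescript
// Illustrative sketch: mapping of the two MoE settings to llama-server flags.
// Flag names and precedence follow the extension change below.
type MoeConfig = { cpu_moe: boolean; n_cpu_moe: number }

function moeOffloadArgs(cfg: MoeConfig): string[] {
  if (cfg.cpu_moe) {
    // Keep all MoE expert weights on CPU; takes precedence over n_cpu_moe.
    return ['--cpu-moe']
  }
  if (cfg.n_cpu_moe > 0) {
    // Keep only the expert weights of the first N layers on CPU.
    return ['--n-cpu-moe', String(cfg.n_cpu_moe)]
  }
  return []
}

// Examples:
// moeOffloadArgs({ cpu_moe: true,  n_cpu_moe: 4 }) -> ['--cpu-moe']
// moeOffloadArgs({ cpu_moe: false, n_cpu_moe: 4 }) -> ['--n-cpu-moe', '4']
// moeOffloadArgs({ cpu_moe: false, n_cpu_moe: 0 }) -> []
```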
22 changes: 22 additions & 0 deletions extensions/llamacpp-extension/settings.json
@@ -351,5 +351,27 @@
"placeholder": "path/to/schema.json",
"type": "text"
}
},
{
"key": "cpu_moe",
"title": "Keep All MoE Weights on CPU",
"description": "Keep all Mixture-of-Experts (MoE) weights on CPU. Useful for large MoE models to reduce GPU memory usage. Note: --override-tensor takes precedence over this setting.",
"controllerType": "checkbox",
"controllerProps": {
"value": false
}
},
{
"key": "n_cpu_moe",
"title": "Keep First N MoE Layers on CPU",
"description": "Keep the first N MoE layers on CPU (0 = disabled). Alternative to keeping all MoE weights on CPU. Useful for optimizing memory usage with large MoE models.",
"controllerType": "input",
"controllerProps": {
"value": 0,
"placeholder": "0 (disabled)",
"type": "number",
"textAlign": "right",
"min": 0
}
}
]
18 changes: 18 additions & 0 deletions extensions/llamacpp-extension/src/index.ts
Expand Up @@ -82,6 +82,8 @@ type LlamacppConfig = {
rope_freq_base: number
rope_freq_scale: number
ctx_shift: boolean
cpu_moe: boolean
n_cpu_moe: number
}

type ModelPlan = {
@@ -1636,6 +1638,22 @@ export default class llamacpp_extension extends AIEngine {
args.push('--no-mmap')
if (cfg.mlock) args.push('--mlock')
if (cfg.no_kv_offload) args.push('--no-kv-offload')

// MoE CPU offload flags - only add if backend supports them
// These flags were added in llama.cpp PR #15077
const backendVersion = Number(version.replace(/^b/, ''))
if (backendVersion >= 6325) {
if (cfg.cpu_moe) {
args.push('--cpu-moe')
} else if (cfg.n_cpu_moe > 0) {
args.push('--n-cpu-moe', String(cfg.n_cpu_moe))
}
} else {
if (cfg.cpu_moe || cfg.n_cpu_moe > 0) {
logger.warn('MoE CPU offload flags require llama.cpp backend version >= 6325. Current version:', version)
}
}

if (isEmbedding) {
args.push('--embedding')
args.push('--pooling mean')
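A small illustrative sketch of the version gate above, assuming backend versions are release tags of the form `b<build>` (e.g. `b6325`), as the `replace(/^b/, '')` call implies; this is not part of the diff.

```typescript
// Illustrative only: how the backend build number is derived and compared.
const parseBackendBuild = (tag: string): number => Number(tag.replace(/^b/, ''))

const supportsMoeOffload = (tag: string): boolean => parseBackendBuild(tag) >= 6325

// parseBackendBuild('b6325') -> 6325, supportsMoeOffload('b6325') -> true  (MoE flags added)
// parseBackendBuild('b6100') -> 6100, supportsMoeOffload('b6100') -> false (flags skipped, warning logged)
```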
196 changes: 193 additions & 3 deletions extensions/llamacpp-extension/src/test/index.test.ts
@@ -143,7 +143,7 @@ describe('llamacpp_extension', () => {
vi.mocked(getJanDataFolderPath).mockResolvedValue('/path/to/jan')
vi.mocked(joinPath).mockImplementation((paths) => Promise.resolve(paths.join('/')))
vi.mocked(fs.existsSync).mockResolvedValue(false)
vi.mocked(fs.fileStat).mockResolvedValue({ size: 1000000 })
vi.mocked(fs.fileStat).mockResolvedValue({ size: 1000000, isDirectory: false })
vi.mocked(fs.mkdir).mockResolvedValue(undefined)
vi.mocked(invoke).mockResolvedValue(undefined)

@@ -213,9 +213,15 @@
rope_scale: 1.0,
rope_freq_base: 10000,
rope_freq_scale: 1.0,
reasoning_budget: 0,
auto_update_engine: false,
auto_unload: true
auto_unload: true,
cpu_moe: false,
n_cpu_moe: 0,
llamacpp_env: '',
memory_util: 'high',
offload_mmproj: true,
override_tensor_buffer_t: '',
ctx_shift: false
}

// Set up providerPath
@@ -261,6 +267,190 @@
api_key: 'test-api-key'
})
})

it('should add --cpu-moe flag when cpu_moe is enabled', async () => {
const { getJanDataFolderPath, joinPath, fs } = await import('@janhq/core')
const { invoke } = await import('@tauri-apps/api/core')

// Mock system info for getBackendExePath
const getSystemInfo = vi.fn().mockResolvedValue({
platform: 'win32',
arch: 'x64'
})

const { getBackendExePath } = await import('../backend')
vi.mocked(getBackendExePath).mockResolvedValue('/path/to/llama-server.exe')

// Set up providerPath
extension['providerPath'] = '/path/to/jan/llamacpp'

// Mock config with cpu_moe enabled
extension['config'] = {
...extension['config'],
cpu_moe: true,
n_cpu_moe: 0,
version_backend: 'v1.0.0/win-avx2-x64' // version >= 6325
}

vi.mocked(getJanDataFolderPath).mockResolvedValue('/path/to/jan')
vi.mocked(joinPath).mockImplementation((paths) => Promise.resolve(paths.join('/')))

// Mock model config
vi.mocked(invoke)
.mockResolvedValueOnce({ // read_yaml
model_path: 'test-model/model.gguf',
name: 'Test Model',
size_bytes: 1000000
})
.mockResolvedValueOnce('test-api-key') // generate_api_key
.mockResolvedValueOnce({ // load_llama_model
model_id: 'test-model',
pid: 123,
port: 3000,
api_key: 'test-api-key'
})

// Mock fetch for health check
vi.mocked(fetch).mockResolvedValue({
ok: true,
json: vi.fn().mockResolvedValue({ status: 'ok' })
} as any)

await extension.load('test-model')

// Verify that invoke was called with --cpu-moe flag
expect(invoke).toHaveBeenCalledWith(
'plugin:llamacpp|load_llama_model',
expect.any(String), // backend_path
expect.any(String), // library_path
expect.arrayContaining(['--cpu-moe']), // args should contain --cpu-moe
expect.any(Object) // envs
)
})

it('should add --n-cpu-moe flag when n_cpu_moe is set', async () => {
const { getJanDataFolderPath, joinPath, fs } = await import('@janhq/core')
const { invoke } = await import('@tauri-apps/api/core')

// Mock system info for getBackendExePath
const getSystemInfo = vi.fn().mockResolvedValue({
platform: 'win32',
arch: 'x64'
})

const { getBackendExePath } = await import('../backend')
vi.mocked(getBackendExePath).mockResolvedValue('/path/to/llama-server.exe')

// Set up providerPath
extension['providerPath'] = '/path/to/jan/llamacpp'

// Mock config with n_cpu_moe set
extension['config'] = {
...extension['config'],
cpu_moe: false,
n_cpu_moe: 4,
version_backend: 'v1.0.0/win-avx2-x64' // version >= 6325
}

vi.mocked(getJanDataFolderPath).mockResolvedValue('/path/to/jan')
vi.mocked(joinPath).mockImplementation((paths) => Promise.resolve(paths.join('/')))

// Mock model config
vi.mocked(invoke)
.mockResolvedValueOnce({ // read_yaml
model_path: 'test-model/model.gguf',
name: 'Test Model',
size_bytes: 1000000
})
.mockResolvedValueOnce('test-api-key') // generate_api_key
.mockResolvedValueOnce({ // load_llama_model
model_id: 'test-model',
pid: 123,
port: 3000,
api_key: 'test-api-key'
})

// Mock fetch for health check
vi.mocked(fetch).mockResolvedValue({
ok: true,
json: vi.fn().mockResolvedValue({ status: 'ok' })
} as any)

await extension.load('test-model')

// Verify that invoke was called with --n-cpu-moe flag
expect(invoke).toHaveBeenCalledWith(
'plugin:llamacpp|load_llama_model',
expect.any(String), // backend_path
expect.any(String), // library_path
expect.arrayContaining(['--n-cpu-moe', '4']), // args should contain --n-cpu-moe 4
expect.any(Object) // envs
)
})

it('should prefer --cpu-moe over --n-cpu-moe when both are set', async () => {
const { getJanDataFolderPath, joinPath, fs } = await import('@janhq/core')
const { invoke } = await import('@tauri-apps/api/core')

// Mock system info for getBackendExePath
const getSystemInfo = vi.fn().mockResolvedValue({
platform: 'win32',
arch: 'x64'
})

const { getBackendExePath } = await import('../backend')
vi.mocked(getBackendExePath).mockResolvedValue('/path/to/llama-server.exe')

// Set up providerPath
extension['providerPath'] = '/path/to/jan/llamacpp'

// Mock config with both cpu_moe and n_cpu_moe set
extension['config'] = {
...extension['config'],
cpu_moe: true,
n_cpu_moe: 4,
version_backend: 'v1.0.0/win-avx2-x64' // version >= 6325
}

vi.mocked(getJanDataFolderPath).mockResolvedValue('/path/to/jan')
vi.mocked(joinPath).mockImplementation((paths) => Promise.resolve(paths.join('/')))

// Mock model config
vi.mocked(invoke)
.mockResolvedValueOnce({ // read_yaml
model_path: 'test-model/model.gguf',
name: 'Test Model',
size_bytes: 1000000
})
.mockResolvedValueOnce('test-api-key') // generate_api_key
.mockResolvedValueOnce({ // load_llama_model
model_id: 'test-model',
pid: 123,
port: 3000,
api_key: 'test-api-key'
})

// Mock fetch for health check
vi.mocked(fetch).mockResolvedValue({
ok: true,
json: vi.fn().mockResolvedValue({ status: 'ok' })
} as any)

await extension.load('test-model')

// Verify that invoke was called with --cpu-moe flag (not --n-cpu-moe)
expect(invoke).toHaveBeenCalledWith(
'plugin:llamacpp|load_llama_model',
expect.any(String), // backend_path
expect.any(String), // library_path
expect.arrayContaining(['--cpu-moe']), // args should contain --cpu-moe
expect.any(Object) // envs
)

// Verify that --n-cpu-moe is NOT in the args
const callArgs = vi.mocked(invoke).mock.calls[2][2] as unknown as string[]
expect(callArgs).not.toContain('--n-cpu-moe')
})
})

describe('unload', () => {