From 6a88ad573eed97d8bb51c3af049899fdca91033d Mon Sep 17 00:00:00 2001
From: Roushan Singh
Date: Thu, 2 Oct 2025 11:48:13 +0530
Subject: [PATCH] feat(llamacpp): add MoE CPU offload optimization settings

- Add cpu_moe and n_cpu_moe configuration options for MoE models
- Implement --cpu-moe and --n-cpu-moe flags with version compatibility checks
- Update documentation with comprehensive MoE optimization section
---
 docs/src/pages/docs/desktop/llama-cpp.mdx     |  11 +
 extensions/llamacpp-extension/settings.json   |  22 ++
 extensions/llamacpp-extension/src/index.ts    |  18 ++
 .../llamacpp-extension/src/test/index.test.ts | 196 +++++++++++++++++-
 4 files changed, 244 insertions(+), 3 deletions(-)

diff --git a/docs/src/pages/docs/desktop/llama-cpp.mdx b/docs/src/pages/docs/desktop/llama-cpp.mdx
index 51aee542fc..421e766bb3 100644
--- a/docs/src/pages/docs/desktop/llama-cpp.mdx
+++ b/docs/src/pages/docs/desktop/llama-cpp.mdx
@@ -202,6 +202,17 @@ These settings are for fine-tuning model behavior and advanced use cases:
 | **Grammar File** | Constrain output format | Empty | For structured output (JSON, code, etc.) |
 | **JSON Schema File** | Enforce JSON structure | Empty | When you need specific JSON formats |
 
+### Mixture-of-Experts (MoE) Optimization
+
+| Setting | What It Does | Default Value | When to Change |
+|---------|-------------|---------------|----------------|
+| **Keep All MoE Weights on CPU** | Offload all MoE expert weights to CPU | Disabled | For large MoE models when GPU memory is limited |
+| **Keep First N MoE Layers on CPU** | Offload the first N MoE layers to CPU | 0 (disabled) | Fine-tune memory usage for specific MoE models |
+
+
+These settings apply only to Mixture-of-Experts (MoE) models such as Mixtral or the Qwen MoE variants. They reduce GPU memory usage by keeping expert weights on the CPU while attention computation stays on the GPU. Note: `--override-tensor` settings take precedence over these options.
+
+
 ## Troubleshooting Common Issues
 
 **Models won't load:**
diff --git a/extensions/llamacpp-extension/settings.json b/extensions/llamacpp-extension/settings.json
index ce5fc62e4b..a270c0733a 100644
--- a/extensions/llamacpp-extension/settings.json
+++ b/extensions/llamacpp-extension/settings.json
@@ -351,5 +351,27 @@
       "placeholder": "path/to/schema.json",
       "type": "text"
     }
+  },
+  {
+    "key": "cpu_moe",
+    "title": "Keep All MoE Weights on CPU",
+    "description": "Keep all Mixture-of-Experts (MoE) weights on CPU. Useful for large MoE models to reduce GPU memory usage. Note: --override-tensor takes precedence over this setting.",
+    "controllerType": "checkbox",
+    "controllerProps": {
+      "value": false
+    }
+  },
+  {
+    "key": "n_cpu_moe",
+    "title": "Keep First N MoE Layers on CPU",
+    "description": "Keep the first N MoE layers on CPU (0 = disabled). Alternative to keeping all MoE weights on CPU. Useful for optimizing memory usage with large MoE models.",
+    "controllerType": "input",
+    "controllerProps": {
+      "value": 0,
+      "placeholder": "0 (disabled)",
+      "type": "number",
+      "textAlign": "right",
+      "min": 0
+    }
   }
 ]
diff --git a/extensions/llamacpp-extension/src/index.ts b/extensions/llamacpp-extension/src/index.ts
index 07d49cd536..84731861c8 100644
--- a/extensions/llamacpp-extension/src/index.ts
+++ b/extensions/llamacpp-extension/src/index.ts
@@ -81,6 +81,8 @@ type LlamacppConfig = {
   rope_freq_base: number
   rope_freq_scale: number
   ctx_shift: boolean
+  cpu_moe: boolean
+  n_cpu_moe: number
 }
 
 type ModelPlan = {
@@ -1629,6 +1631,22 @@ export default class llamacpp_extension extends AIEngine {
       args.push('--no-mmap')
     if (cfg.mlock) args.push('--mlock')
     if (cfg.no_kv_offload) args.push('--no-kv-offload')
+
+    // MoE CPU offload flags - only add if backend supports them
+    // These flags were added in llama.cpp PR #15077
+    const backendVersion = Number(version.replace(/^b/, ''))
+    if (backendVersion >= 6325) {
+      if (cfg.cpu_moe) {
+        args.push('--cpu-moe')
+      } else if (cfg.n_cpu_moe > 0) {
+        args.push('--n-cpu-moe', String(cfg.n_cpu_moe))
+      }
+    } else {
+      if (cfg.cpu_moe || cfg.n_cpu_moe > 0) {
+        logger.warn('MoE CPU offload flags require llama.cpp backend version >= 6325. Current version:', version)
+      }
+    }
+
     if (isEmbedding) {
       args.push('--embedding')
       args.push('--pooling mean')
diff --git a/extensions/llamacpp-extension/src/test/index.test.ts b/extensions/llamacpp-extension/src/test/index.test.ts
index 59090c7bf4..7c9307f8c5 100644
--- a/extensions/llamacpp-extension/src/test/index.test.ts
+++ b/extensions/llamacpp-extension/src/test/index.test.ts
@@ -143,7 +143,7 @@ describe('llamacpp_extension', () => {
     vi.mocked(getJanDataFolderPath).mockResolvedValue('/path/to/jan')
     vi.mocked(joinPath).mockImplementation((paths) => Promise.resolve(paths.join('/')))
     vi.mocked(fs.existsSync).mockResolvedValue(false)
-    vi.mocked(fs.fileStat).mockResolvedValue({ size: 1000000 })
+    vi.mocked(fs.fileStat).mockResolvedValue({ size: 1000000, isDirectory: false })
     vi.mocked(fs.mkdir).mockResolvedValue(undefined)
     vi.mocked(invoke).mockResolvedValue(undefined)
 
@@ -213,9 +213,15 @@
       rope_scale: 1.0,
       rope_freq_base: 10000,
       rope_freq_scale: 1.0,
-      reasoning_budget: 0,
       auto_update_engine: false,
-      auto_unload: true
+      auto_unload: true,
+      cpu_moe: false,
+      n_cpu_moe: 0,
+      llamacpp_env: '',
+      memory_util: 'high',
+      offload_mmproj: true,
+      override_tensor_buffer_t: '',
+      ctx_shift: false
     }
     // Set up providerPath
     extension['providerPath'] = '/path/to/jan/llamacpp'
@@ -261,6 +267,190 @@
         api_key: 'test-api-key'
       })
     })
+
+    it('should add --cpu-moe flag when cpu_moe is enabled', async () => {
+      const { getJanDataFolderPath, joinPath, fs } = await import('@janhq/core')
+      const { invoke } = await import('@tauri-apps/api/core')
+
+      // Mock system info for getBackendExePath
+      const getSystemInfo = vi.fn().mockResolvedValue({
+        platform: 'win32',
+        arch: 'x64'
+      })
+
+      const { getBackendExePath } = await import('../backend')
+      vi.mocked(getBackendExePath).mockResolvedValue('/path/to/llama-server.exe')
+
+      // Set up providerPath
+      extension['providerPath'] = '/path/to/jan/llamacpp'
+
+      // Mock config with cpu_moe enabled
+      extension['config'] = {
+        ...extension['config'],
+        cpu_moe: true,
+        n_cpu_moe: 0,
+        version_backend: 'b6325/win-avx2-x64' // version >= 6325
+      }
+
+      vi.mocked(getJanDataFolderPath).mockResolvedValue('/path/to/jan')
+      vi.mocked(joinPath).mockImplementation((paths) => Promise.resolve(paths.join('/')))
+
+      // Mock model config
+      vi.mocked(invoke)
+        .mockResolvedValueOnce({ // read_yaml
+          model_path: 'test-model/model.gguf',
+          name: 'Test Model',
+          size_bytes: 1000000
+        })
+        .mockResolvedValueOnce('test-api-key') // generate_api_key
+        .mockResolvedValueOnce({ // load_llama_model
+          model_id: 'test-model',
+          pid: 123,
+          port: 3000,
+          api_key: 'test-api-key'
+        })
+
+      // Mock fetch for health check
+      vi.mocked(fetch).mockResolvedValue({
+        ok: true,
+        json: vi.fn().mockResolvedValue({ status: 'ok' })
+      } as any)
+
+      await extension.load('test-model')
+
+      // Verify that invoke was called with --cpu-moe flag
+      expect(invoke).toHaveBeenCalledWith(
+        'plugin:llamacpp|load_llama_model',
+        expect.any(String), // backend_path
+        expect.any(String), // library_path
+        expect.arrayContaining(['--cpu-moe']), // args should contain --cpu-moe
+        expect.any(Object) // envs
+      )
+    })
+
+    it('should add --n-cpu-moe flag when n_cpu_moe is set', async () => {
+      const { getJanDataFolderPath, joinPath, fs } = await import('@janhq/core')
+      const { invoke } = await import('@tauri-apps/api/core')
+
+      // Mock system info for getBackendExePath
+      const getSystemInfo = vi.fn().mockResolvedValue({
+        platform: 'win32',
+        arch: 'x64'
+      })
+
+      const { getBackendExePath } = await import('../backend')
+      vi.mocked(getBackendExePath).mockResolvedValue('/path/to/llama-server.exe')
+
+      // Set up providerPath
+      extension['providerPath'] = '/path/to/jan/llamacpp'
+
+      // Mock config with n_cpu_moe set
+      extension['config'] = {
+        ...extension['config'],
+        cpu_moe: false,
+        n_cpu_moe: 4,
+        version_backend: 'b6325/win-avx2-x64' // version >= 6325
+      }
+
+      vi.mocked(getJanDataFolderPath).mockResolvedValue('/path/to/jan')
+      vi.mocked(joinPath).mockImplementation((paths) => Promise.resolve(paths.join('/')))
+
+      // Mock model config
+      vi.mocked(invoke)
+        .mockResolvedValueOnce({ // read_yaml
+          model_path: 'test-model/model.gguf',
+          name: 'Test Model',
+          size_bytes: 1000000
+        })
+        .mockResolvedValueOnce('test-api-key') // generate_api_key
+        .mockResolvedValueOnce({ // load_llama_model
+          model_id: 'test-model',
+          pid: 123,
+          port: 3000,
+          api_key: 'test-api-key'
+        })
+
+      // Mock fetch for health check
+      vi.mocked(fetch).mockResolvedValue({
+        ok: true,
+        json: vi.fn().mockResolvedValue({ status: 'ok' })
+      } as any)
+
+      await extension.load('test-model')
+
+      // Verify that invoke was called with --n-cpu-moe flag
+      expect(invoke).toHaveBeenCalledWith(
+        'plugin:llamacpp|load_llama_model',
+        expect.any(String), // backend_path
+        expect.any(String), // library_path
+        expect.arrayContaining(['--n-cpu-moe', '4']), // args should contain --n-cpu-moe 4
+        expect.any(Object) // envs
+      )
+    })
+
+    it('should prefer --cpu-moe over --n-cpu-moe when both are set', async () => {
+      const { getJanDataFolderPath, joinPath, fs } = await import('@janhq/core')
+      const { invoke } = await import('@tauri-apps/api/core')
+
+      // Mock system info for getBackendExePath
+      const getSystemInfo = vi.fn().mockResolvedValue({
+        platform: 'win32',
+        arch: 'x64'
+      })
+
+      const { getBackendExePath } = await import('../backend')
+      vi.mocked(getBackendExePath).mockResolvedValue('/path/to/llama-server.exe')
+
+      // Set up providerPath
+      extension['providerPath'] = '/path/to/jan/llamacpp'
+
+      // Mock config with both cpu_moe and n_cpu_moe set
+      extension['config'] = {
+        ...extension['config'],
+        cpu_moe: true,
+        n_cpu_moe: 4,
+        version_backend: 'b6325/win-avx2-x64' // version >= 6325
+      }
+
+      vi.mocked(getJanDataFolderPath).mockResolvedValue('/path/to/jan')
+      vi.mocked(joinPath).mockImplementation((paths) => Promise.resolve(paths.join('/')))
+
+      // Mock model config
+      vi.mocked(invoke)
+        .mockResolvedValueOnce({ // read_yaml
+          model_path: 'test-model/model.gguf',
+          name: 'Test Model',
+          size_bytes: 1000000
+        })
+        .mockResolvedValueOnce('test-api-key') // generate_api_key
+        .mockResolvedValueOnce({ // load_llama_model
+          model_id: 'test-model',
+          pid: 123,
+          port: 3000,
+          api_key: 'test-api-key'
+        })
+
+      // Mock fetch for health check
+      vi.mocked(fetch).mockResolvedValue({
+        ok: true,
+        json: vi.fn().mockResolvedValue({ status: 'ok' })
+      } as any)
+
+      await extension.load('test-model')
+
+      // Verify that invoke was called with --cpu-moe flag (not --n-cpu-moe)
+      expect(invoke).toHaveBeenCalledWith(
+        'plugin:llamacpp|load_llama_model',
+        expect.any(String), // backend_path
+        expect.any(String), // library_path
+        expect.arrayContaining(['--cpu-moe']), // args should contain --cpu-moe
+        expect.any(Object) // envs
+      )
+
+      // Verify that --n-cpu-moe is NOT in the args (args is the 4th argument of the third invoke call)
+      const callArgs = vi.mocked(invoke).mock.calls[2][3] as unknown as string[]
+      expect(callArgs).not.toContain('--n-cpu-moe')
+    })
   })
 
   describe('unload', () => {