From 6a88ad573eed97d8bb51c3af049899fdca91033d Mon Sep 17 00:00:00 2001
From: Roushan Singh
Date: Thu, 2 Oct 2025 11:48:13 +0530
Subject: [PATCH] feat(llamacpp): add MoE CPU offload optimization settings

- Add cpu_moe and n_cpu_moe configuration options for MoE models
- Implement --cpu-moe and --n-cpu-moe flags with version compatibility checks
- Update documentation with comprehensive MoE optimization section
---
 docs/src/pages/docs/desktop/llama-cpp.mdx     |  11 +
 extensions/llamacpp-extension/settings.json   |  22 ++
 extensions/llamacpp-extension/src/index.ts    |  18 ++
 .../llamacpp-extension/src/test/index.test.ts | 196 +++++++++++++++++-
 4 files changed, 244 insertions(+), 3 deletions(-)

diff --git a/docs/src/pages/docs/desktop/llama-cpp.mdx b/docs/src/pages/docs/desktop/llama-cpp.mdx
index 51aee542fc..421e766bb3 100644
--- a/docs/src/pages/docs/desktop/llama-cpp.mdx
+++ b/docs/src/pages/docs/desktop/llama-cpp.mdx
@@ -202,6 +202,17 @@ These settings are for fine-tuning model behavior and advanced use cases:
 | **Grammar File** | Constrain output format | Empty | For structured output (JSON, code, etc.) |
 | **JSON Schema File** | Enforce JSON structure | Empty | When you need specific JSON formats |
 
+### Mixture-of-Experts (MoE) Optimization
+
+| Setting | What It Does | Default Value | When to Change |
+|---------|-------------|---------------|----------------|
+| **Keep All MoE Weights on CPU** | Offload all MoE expert weights to CPU | Disabled | For large MoE models when GPU memory is limited |
+| **Keep First N MoE Layers on CPU** | Offload the first N MoE layers to CPU | 0 (disabled) | Fine-tune memory usage for specific MoE models |
+
+
+These settings apply only to Mixture-of-Experts (MoE) models such as Mixtral or the Qwen MoE variants. They reduce GPU memory usage by keeping expert weights on the CPU while attention computation stays on the GPU. Note: `--override-tensor` settings take precedence over these options.
+
+
 ## Troubleshooting Common Issues
 
 **Models won't load:**
diff --git a/extensions/llamacpp-extension/settings.json b/extensions/llamacpp-extension/settings.json
index ce5fc62e4b..a270c0733a 100644
--- a/extensions/llamacpp-extension/settings.json
+++ b/extensions/llamacpp-extension/settings.json
@@ -351,5 +351,27 @@
       "placeholder": "path/to/schema.json",
       "type": "text"
     }
+  },
+  {
+    "key": "cpu_moe",
+    "title": "Keep All MoE Weights on CPU",
+    "description": "Keep all Mixture-of-Experts (MoE) weights on CPU. Useful for large MoE models to reduce GPU memory usage. Note: --override-tensor takes precedence over this setting.",
+    "controllerType": "checkbox",
+    "controllerProps": {
+      "value": false
+    }
+  },
+  {
+    "key": "n_cpu_moe",
+    "title": "Keep First N MoE Layers on CPU",
+    "description": "Keep the first N MoE layers on CPU (0 = disabled). Alternative to keeping all MoE weights on CPU. Useful for optimizing memory usage with large MoE models.",
+    "controllerType": "input",
+    "controllerProps": {
+      "value": 0,
+      "placeholder": "0 (disabled)",
+      "type": "number",
+      "textAlign": "right",
+      "min": 0
+    }
   }
 ]
diff --git a/extensions/llamacpp-extension/src/index.ts b/extensions/llamacpp-extension/src/index.ts
index 07d49cd536..84731861c8 100644
--- a/extensions/llamacpp-extension/src/index.ts
+++ b/extensions/llamacpp-extension/src/index.ts
@@ -81,6 +81,8 @@ type LlamacppConfig = {
   rope_freq_base: number
   rope_freq_scale: number
   ctx_shift: boolean
+  cpu_moe: boolean
+  n_cpu_moe: number
 }
 
 type ModelPlan = {
@@ -1629,6 +1631,22 @@ export default class llamacpp_extension extends AIEngine {
       args.push('--no-mmap')
     if (cfg.mlock) args.push('--mlock')
     if (cfg.no_kv_offload) args.push('--no-kv-offload')
+
+    // MoE CPU offload flags - only add if backend supports them
+    // These flags were added in llama.cpp PR #15077
+    const backendVersion = Number(version.replace(/^b/, ''))
+    if (backendVersion >= 6325) {
+      if (cfg.cpu_moe) {
+        args.push('--cpu-moe')
+      } else if (cfg.n_cpu_moe > 0) {
+        args.push('--n-cpu-moe', String(cfg.n_cpu_moe))
+      }
+    } else {
+      if (cfg.cpu_moe || cfg.n_cpu_moe > 0) {
+        logger.warn('MoE CPU offload flags require llama.cpp backend version >= 6325. Current version:', version)
+      }
+    }
+
     if (isEmbedding) {
       args.push('--embedding')
       args.push('--pooling mean')
diff --git a/extensions/llamacpp-extension/src/test/index.test.ts b/extensions/llamacpp-extension/src/test/index.test.ts
index 59090c7bf4..7c9307f8c5 100644
--- a/extensions/llamacpp-extension/src/test/index.test.ts
+++ b/extensions/llamacpp-extension/src/test/index.test.ts
@@ -143,7 +143,7 @@ describe('llamacpp_extension', () => {
     vi.mocked(getJanDataFolderPath).mockResolvedValue('/path/to/jan')
     vi.mocked(joinPath).mockImplementation((paths) => Promise.resolve(paths.join('/')))
     vi.mocked(fs.existsSync).mockResolvedValue(false)
-    vi.mocked(fs.fileStat).mockResolvedValue({ size: 1000000 })
+    vi.mocked(fs.fileStat).mockResolvedValue({ size: 1000000, isDirectory: false })
     vi.mocked(fs.mkdir).mockResolvedValue(undefined)
     vi.mocked(invoke).mockResolvedValue(undefined)
 
@@ -213,9 +213,15 @@
       rope_scale: 1.0,
       rope_freq_base: 10000,
       rope_freq_scale: 1.0,
-      reasoning_budget: 0,
       auto_update_engine: false,
-      auto_unload: true
+      auto_unload: true,
+      cpu_moe: false,
+      n_cpu_moe: 0,
+      llamacpp_env: '',
+      memory_util: 'high',
+      offload_mmproj: true,
+      override_tensor_buffer_t: '',
+      ctx_shift: false
     }
     // Set up providerPath
     extension['providerPath'] = '/path/to/jan/llamacpp'
@@ -261,6 +267,190 @@
         api_key: 'test-api-key'
       })
     })
+
+    it('should add --cpu-moe flag when cpu_moe is enabled', async () => {
+      const { getJanDataFolderPath, joinPath, fs } = await import('@janhq/core')
+      const { invoke } = await import('@tauri-apps/api/core')
+
+      // Mock system info for getBackendExePath
+      const getSystemInfo = vi.fn().mockResolvedValue({
+        platform: 'win32',
+        arch: 'x64'
+      })
+
+      const { getBackendExePath } = await import('../backend')
+      vi.mocked(getBackendExePath).mockResolvedValue('/path/to/llama-server.exe')
+
+      // Set up providerPath
+      extension['providerPath'] = '/path/to/jan/llamacpp'
+
+      // Mock config with cpu_moe enabled
+      extension['config'] = {
+        ...extension['config'],
+        cpu_moe: true,
+        n_cpu_moe: 0,
+        version_backend: 'b6325/win-avx2-x64' // version >= 6325
+      }
+
+      vi.mocked(getJanDataFolderPath).mockResolvedValue('/path/to/jan')
+      vi.mocked(joinPath).mockImplementation((paths) => Promise.resolve(paths.join('/')))
+
+      // Mock model config
+      vi.mocked(invoke)
+        .mockResolvedValueOnce({ // read_yaml
+          model_path: 'test-model/model.gguf',
+          name: 'Test Model',
+          size_bytes: 1000000
+        })
+        .mockResolvedValueOnce('test-api-key') // generate_api_key
+        .mockResolvedValueOnce({ // load_llama_model
+          model_id: 'test-model',
+          pid: 123,
+          port: 3000,
+          api_key: 'test-api-key'
+        })
+
+      // Mock fetch for health check
+      vi.mocked(fetch).mockResolvedValue({
+        ok: true,
+        json: vi.fn().mockResolvedValue({ status: 'ok' })
+      } as any)
+
+      await extension.load('test-model')
+
+      // Verify that invoke was called with --cpu-moe flag
+      expect(invoke).toHaveBeenCalledWith(
+        'plugin:llamacpp|load_llama_model',
+        expect.any(String), // backend_path
+        expect.any(String), // library_path
+        expect.arrayContaining(['--cpu-moe']), // args should contain --cpu-moe
+        expect.any(Object) // envs
+      )
+    })
+
+    it('should add --n-cpu-moe flag when n_cpu_moe is set', async () => {
+      const { getJanDataFolderPath, joinPath, fs } = await import('@janhq/core')
+      const { invoke } = await import('@tauri-apps/api/core')
+
+      // Mock system info for getBackendExePath
+      const getSystemInfo = vi.fn().mockResolvedValue({
+        platform: 'win32',
+        arch: 'x64'
+      })
+
+      const { getBackendExePath } = await import('../backend')
+      vi.mocked(getBackendExePath).mockResolvedValue('/path/to/llama-server.exe')
+
+      // Set up providerPath
+      extension['providerPath'] = '/path/to/jan/llamacpp'
+
+      // Mock config with n_cpu_moe set
+      extension['config'] = {
+        ...extension['config'],
+        cpu_moe: false,
+        n_cpu_moe: 4,
+        version_backend: 'b6325/win-avx2-x64' // version >= 6325
+      }
+
+      vi.mocked(getJanDataFolderPath).mockResolvedValue('/path/to/jan')
+      vi.mocked(joinPath).mockImplementation((paths) => Promise.resolve(paths.join('/')))
+
+      // Mock model config
+      vi.mocked(invoke)
+        .mockResolvedValueOnce({ // read_yaml
+          model_path: 'test-model/model.gguf',
+          name: 'Test Model',
+          size_bytes: 1000000
+        })
+        .mockResolvedValueOnce('test-api-key') // generate_api_key
+        .mockResolvedValueOnce({ // load_llama_model
+          model_id: 'test-model',
+          pid: 123,
+          port: 3000,
+          api_key: 'test-api-key'
+        })
+
+      // Mock fetch for health check
+      vi.mocked(fetch).mockResolvedValue({
+        ok: true,
+        json: vi.fn().mockResolvedValue({ status: 'ok' })
+      } as any)
+
+      await extension.load('test-model')
+
+      // Verify that invoke was called with --n-cpu-moe flag
+      expect(invoke).toHaveBeenCalledWith(
+        'plugin:llamacpp|load_llama_model',
+        expect.any(String), // backend_path
+        expect.any(String), // library_path
+        expect.arrayContaining(['--n-cpu-moe', '4']), // args should contain --n-cpu-moe 4
+        expect.any(Object) // envs
+      )
+    })
+
+    it('should prefer --cpu-moe over --n-cpu-moe when both are set', async () => {
+      const { getJanDataFolderPath, joinPath, fs } = await import('@janhq/core')
+      const { invoke } = await import('@tauri-apps/api/core')
+
+      // Mock system info for getBackendExePath
+      const getSystemInfo = vi.fn().mockResolvedValue({
+        platform: 'win32',
+        arch: 'x64'
+      })
+
+      const { getBackendExePath } = await import('../backend')
+      vi.mocked(getBackendExePath).mockResolvedValue('/path/to/llama-server.exe')
+
+      // Set up providerPath
+      extension['providerPath'] = '/path/to/jan/llamacpp'
+
+      // Mock config with both cpu_moe and n_cpu_moe set
+      extension['config'] = {
+        ...extension['config'],
+        cpu_moe: true,
+        n_cpu_moe: 4,
+        version_backend: 'b6325/win-avx2-x64' // version >= 6325
+      }
+
+      vi.mocked(getJanDataFolderPath).mockResolvedValue('/path/to/jan')
+      vi.mocked(joinPath).mockImplementation((paths) => Promise.resolve(paths.join('/')))
+
+      // Mock model config
+      vi.mocked(invoke)
+        .mockResolvedValueOnce({ // read_yaml
+          model_path: 'test-model/model.gguf',
+          name: 'Test Model',
+          size_bytes: 1000000
+        })
+        .mockResolvedValueOnce('test-api-key') // generate_api_key
+        .mockResolvedValueOnce({ // load_llama_model
+          model_id: 'test-model',
+          pid: 123,
+          port: 3000,
+          api_key: 'test-api-key'
+        })
+
+      // Mock fetch for health check
+      vi.mocked(fetch).mockResolvedValue({
+        ok: true,
+        json: vi.fn().mockResolvedValue({ status: 'ok' })
+      } as any)
+
+      await extension.load('test-model')
+
+      // Verify that invoke was called with --cpu-moe flag (not --n-cpu-moe)
+      expect(invoke).toHaveBeenCalledWith(
+        'plugin:llamacpp|load_llama_model',
+        expect.any(String), // backend_path
+        expect.any(String), // library_path
+        expect.arrayContaining(['--cpu-moe']), // args should contain --cpu-moe
+        expect.any(Object) // envs
+      )
+
+      // Verify that --n-cpu-moe is NOT in the args (args is the 4th argument of the third invoke call)
+      const callArgs = vi.mocked(invoke).mock.calls[2][3] as unknown as string[]
+      expect(callArgs).not.toContain('--n-cpu-moe')
+    })
   })
 
   describe('unload', () => {