diff --git a/.github/patches/windows/msvcp140.dll b/.github/patches/windows/msvcp140.dll index f999742d9..d3d103ee0 100644 Binary files a/.github/patches/windows/msvcp140.dll and b/.github/patches/windows/msvcp140.dll differ diff --git a/.github/patches/windows/vcruntime140.dll b/.github/patches/windows/vcruntime140.dll index 3a4aded20..8edab904f 100644 Binary files a/.github/patches/windows/vcruntime140.dll and b/.github/patches/windows/vcruntime140.dll differ diff --git a/.github/patches/windows/vcruntime140_1.dll b/.github/patches/windows/vcruntime140_1.dll index 3ebabdee6..2ef481dbf 100644 Binary files a/.github/patches/windows/vcruntime140_1.dll and b/.github/patches/windows/vcruntime140_1.dll differ diff --git a/.github/workflows/beta-build.yml b/.github/workflows/beta-build.yml index 1bf324d96..1d5480312 100644 --- a/.github/workflows/beta-build.yml +++ b/.github/workflows/beta-build.yml @@ -9,7 +9,7 @@ jobs: get-update-version: uses: ./.github/workflows/template-get-update-version.yml - get-cortex-llamacpp-latest-version: + get-llamacpp-latest-version: uses: ./.github/workflows/template-cortex-llamacpp-latest-version.yml create-draft-release: @@ -39,7 +39,7 @@ jobs: build-macos: uses: ./.github/workflows/template-build-macos.yml - needs: [get-update-version, create-draft-release, get-cortex-llamacpp-latest-version] + needs: [get-update-version, create-draft-release, get-llamacpp-latest-version] secrets: inherit with: ref: ${{ github.ref }} @@ -48,12 +48,12 @@ jobs: cmake-flags: "-DCORTEX_VARIANT=beta -DCORTEX_CPP_VERSION='v${{ needs.get-update-version.outputs.new_version }}' -DCMAKE_TOOLCHAIN_FILE=vcpkg/scripts/buildsystems/vcpkg.cmake" channel: beta upload_url: ${{ needs.create-draft-release.outputs.upload_url }} - cortex-llamacpp-version: ${{ needs.get-cortex-llamacpp-latest-version.outputs.cortex_llamacpp_latest_version }} + llamacpp-version: ${{ needs.get-llamacpp-latest-version.outputs.llamacpp_latest_version }} build-windows-x64: uses: ./.github/workflows/template-build-windows-x64.yml secrets: inherit - needs: [get-update-version, create-draft-release, get-cortex-llamacpp-latest-version] + needs: [get-update-version, create-draft-release, get-llamacpp-latest-version] with: ref: ${{ github.ref }} public_provider: github @@ -64,12 +64,12 @@ jobs: ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache' channel: beta upload_url: ${{ needs.create-draft-release.outputs.upload_url }} - cortex-llamacpp-version: ${{ needs.get-cortex-llamacpp-latest-version.outputs.cortex_llamacpp_latest_version }} + llamacpp-version: ${{ needs.get-llamacpp-latest-version.outputs.llamacpp_latest_version }} build-linux-x64: uses: ./.github/workflows/template-build-linux.yml secrets: inherit - needs: [get-update-version, create-draft-release, get-cortex-llamacpp-latest-version] + needs: [get-update-version, create-draft-release, get-llamacpp-latest-version] with: ref: ${{ github.ref }} public_provider: github @@ -78,13 +78,13 @@ jobs: cmake-flags: "-DCORTEX_VARIANT=beta -DCORTEX_CPP_VERSION='v${{ needs.get-update-version.outputs.new_version }}' -DCMAKE_TOOLCHAIN_FILE=/home/runner/actions-runner/_work/cortex.cpp/cortex.cpp/engine/vcpkg/scripts/buildsystems/vcpkg.cmake" channel: beta upload_url: ${{ needs.create-draft-release.outputs.upload_url }} - cortex-llamacpp-version: ${{ needs.get-cortex-llamacpp-latest-version.outputs.cortex_llamacpp_latest_version }} + llamacpp-version: ${{ needs.get-llamacpp-latest-version.outputs.llamacpp_latest_version }} arch: amd64 build-linux-arm64: uses: 
./.github/workflows/template-build-linux.yml secrets: inherit - needs: [get-update-version, create-draft-release, get-cortex-llamacpp-latest-version] + needs: [get-update-version, create-draft-release, get-llamacpp-latest-version] with: ref: ${{ github.ref }} public_provider: github @@ -93,13 +93,13 @@ jobs: cmake-flags: "-DCORTEX_VARIANT=beta -DCORTEX_CPP_VERSION='v${{ needs.get-update-version.outputs.new_version }}' -DCMAKE_TOOLCHAIN_FILE=/home/runner/actions-runner/_work/cortex.cpp/cortex.cpp/engine/vcpkg/scripts/buildsystems/vcpkg.cmake" channel: beta upload_url: ${{ needs.create-draft-release.outputs.upload_url }} - cortex-llamacpp-version: ${{ needs.get-cortex-llamacpp-latest-version.outputs.cortex_llamacpp_latest_version }} + llamacpp-version: ${{ needs.get-llamacpp-latest-version.outputs.llamacpp_latest_version }} arch: arm64 build-docker-x64: uses: ./.github/workflows/template-build-docker-x64.yml secrets: inherit - needs: [get-update-version, get-cortex-llamacpp-latest-version] + needs: [get-update-version, get-llamacpp-latest-version] with: ref: ${{ github.ref }} new_version: ${{ needs.get-update-version.outputs.new_version }} diff --git a/.github/workflows/cortex-cpp-quality-gate.yml b/.github/workflows/cortex-cpp-quality-gate.yml index 279dd77d6..fc2d52b63 100644 --- a/.github/workflows/cortex-cpp-quality-gate.yml +++ b/.github/workflows/cortex-cpp-quality-gate.yml @@ -150,6 +150,7 @@ jobs: run: | cd engine mkdir -p ~/.config/cortexcpp/ + mkdir -p ~/.local/share/cortexcpp/ echo "huggingFaceToken: ${{ secrets.HUGGINGFACE_TOKEN_READ }}" > ~/.config/cortexcpp/.cortexrc echo "gitHubToken: ${{ secrets.PAT_SERVICE_ACCOUNT }}" >> ~/.config/cortexcpp/.cortexrc # ./build/cortex @@ -177,6 +178,7 @@ jobs: run: | cd engine mkdir -p ~/.config/cortexcpp/ + mkdir -p ~/.local/share/cortexcpp/ echo "apiServerPort: 3928" > ~/.config/cortexcpp/.cortexrc echo "huggingFaceToken: ${{ secrets.HUGGINGFACE_TOKEN_READ }}" >> ~/.config/cortexcpp/.cortexrc echo "gitHubToken: ${{ secrets.PAT_SERVICE_ACCOUNT }}" >> ~/.config/cortexcpp/.cortexrc @@ -456,6 +458,7 @@ jobs: run: | cd engine mkdir -p ~/.config/cortexcpp/ + mkdir -p ~/.local/share/cortexcpp/ echo "gitHubToken: ${{ secrets.GITHUB_TOKEN }}" > ~/.config/cortexcpp/.cortexrc # ./build/cortex cat ~/.config/cortexcpp/.cortexrc @@ -481,6 +484,7 @@ jobs: run: | cd engine mkdir -p ~/.config/cortexcpp/ + mkdir -p ~/.local/share/cortexcpp/ echo "apiServerPort: 3928" > ~/.config/cortexcpp/.cortexrc echo "gitHubToken: ${{ secrets.GITHUB_TOKEN }}" > ~/.config/cortexcpp/.cortexrc # ./build/cortex diff --git a/.github/workflows/nightly-build.yml b/.github/workflows/nightly-build.yml index 1f076dc97..efdbfdf6f 100644 --- a/.github/workflows/nightly-build.yml +++ b/.github/workflows/nightly-build.yml @@ -43,12 +43,12 @@ jobs: get-update-version: uses: ./.github/workflows/template-get-update-version.yml - get-cortex-llamacpp-latest-version: + get-llamacpp-latest-version: uses: ./.github/workflows/template-cortex-llamacpp-latest-version.yml build-macos: uses: ./.github/workflows/template-build-macos.yml - needs: [get-update-version, set-public-provider, get-cortex-llamacpp-latest-version] + needs: [get-update-version, set-public-provider, get-llamacpp-latest-version] secrets: inherit with: ref: ${{ needs.set-public-provider.outputs.ref }} @@ -56,12 +56,12 @@ jobs: new_version: ${{ needs.get-update-version.outputs.new_version }} cmake-flags: "-DCORTEX_VARIANT=nightly -DCORTEX_CPP_VERSION='v${{ needs.get-update-version.outputs.new_version }}' 
-DCMAKE_TOOLCHAIN_FILE=vcpkg/scripts/buildsystems/vcpkg.cmake" channel: nightly - cortex-llamacpp-version: ${{ needs.get-cortex-llamacpp-latest-version.outputs.cortex_llamacpp_latest_version }} + llamacpp-version: ${{ needs.get-llamacpp-latest-version.outputs.llamacpp_latest_version }} build-windows-x64: uses: ./.github/workflows/template-build-windows-x64.yml secrets: inherit - needs: [get-update-version, set-public-provider, get-cortex-llamacpp-latest-version] + needs: [get-update-version, set-public-provider, get-llamacpp-latest-version] with: ref: ${{ needs.set-public-provider.outputs.ref }} public_provider: ${{ needs.set-public-provider.outputs.public_provider }} @@ -71,12 +71,12 @@ jobs: build-deps-cmake-flags: "-DCMAKE_BUILD_TYPE=RELEASE -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache' channel: nightly - cortex-llamacpp-version: ${{ needs.get-cortex-llamacpp-latest-version.outputs.cortex_llamacpp_latest_version }} + llamacpp-version: ${{ needs.get-llamacpp-latest-version.outputs.llamacpp_latest_version }} build-linux-x64: uses: ./.github/workflows/template-build-linux.yml secrets: inherit - needs: [get-update-version, set-public-provider, get-cortex-llamacpp-latest-version] + needs: [get-update-version, set-public-provider, get-llamacpp-latest-version] with: ref: ${{ needs.set-public-provider.outputs.ref }} public_provider: ${{ needs.set-public-provider.outputs.public_provider }} @@ -84,13 +84,13 @@ jobs: runs-on: ubuntu-20-04 cmake-flags: "-DCORTEX_VARIANT=nightly -DCORTEX_CPP_VERSION='v${{ needs.get-update-version.outputs.new_version }}' -DCMAKE_TOOLCHAIN_FILE=/home/runner/actions-runner/_work/cortex.cpp/cortex.cpp/engine/vcpkg/scripts/buildsystems/vcpkg.cmake" channel: nightly - cortex-llamacpp-version: ${{ needs.get-cortex-llamacpp-latest-version.outputs.cortex_llamacpp_latest_version }} + llamacpp-version: ${{ needs.get-llamacpp-latest-version.outputs.llamacpp_latest_version }} arch: amd64 build-linux-arm64: uses: ./.github/workflows/template-build-linux.yml secrets: inherit - needs: [get-update-version, set-public-provider, get-cortex-llamacpp-latest-version] + needs: [get-update-version, set-public-provider, get-llamacpp-latest-version] with: ref: ${{ needs.set-public-provider.outputs.ref }} public_provider: ${{ needs.set-public-provider.outputs.public_provider }} @@ -98,13 +98,13 @@ jobs: runs-on: ubuntu-2004-arm64 cmake-flags: "-DCORTEX_VARIANT=nightly -DCORTEX_CPP_VERSION='v${{ needs.get-update-version.outputs.new_version }}' -DCMAKE_TOOLCHAIN_FILE=/home/runner/actions-runner/_work/cortex.cpp/cortex.cpp/engine/vcpkg/scripts/buildsystems/vcpkg.cmake" channel: nightly - cortex-llamacpp-version: ${{ needs.get-cortex-llamacpp-latest-version.outputs.cortex_llamacpp_latest_version }} + llamacpp-version: ${{ needs.get-llamacpp-latest-version.outputs.llamacpp_latest_version }} arch: arm64 update-latest-version: runs-on: ubuntu-latest if: needs.set-public-provider.outputs.public_provider == 'aws-s3' - needs: [get-update-version, set-public-provider, build-linux-x64, build-linux-arm64, build-macos, build-windows-x64, get-cortex-llamacpp-latest-version] + needs: [get-update-version, set-public-provider, build-linux-x64, build-linux-arm64, build-macos, build-windows-x64, get-llamacpp-latest-version] steps: - name: Update latest version id: update-latest-version @@ -132,7 +132,7 @@ jobs: if: needs.set-public-provider.outputs.public_provider == 
'aws-s3' uses: ./.github/workflows/template-build-docker-x64.yml secrets: inherit - needs: [get-update-version, set-public-provider, get-cortex-llamacpp-latest-version, update-latest-version] + needs: [get-update-version, set-public-provider, get-llamacpp-latest-version, update-latest-version] with: ref: ${{ needs.set-public-provider.outputs.ref }} new_version: nightly-${{ needs.get-update-version.outputs.new_version }} @@ -141,7 +141,7 @@ jobs: tags: menloltd/cortex:nightly-${{ needs.get-update-version.outputs.new_version }} noti-discord-nightly-and-update-url-readme: - needs: [build-macos, build-windows-x64, build-linux-x64, get-update-version, set-public-provider, get-cortex-llamacpp-latest-version, update-latest-version, build-docker-x64] + needs: [build-macos, build-windows-x64, build-linux-x64, get-update-version, set-public-provider, get-llamacpp-latest-version, update-latest-version, build-docker-x64] secrets: inherit if: github.event_name == 'schedule' uses: ./.github/workflows/template-noti-discord.yaml @@ -150,7 +150,7 @@ jobs: new_version: ${{ needs.get-update-version.outputs.new_version }} noti-discord-manual: - needs: [build-macos, build-windows-x64, build-linux-x64, get-update-version, set-public-provider, get-cortex-llamacpp-latest-version, build-docker-x64] + needs: [build-macos, build-windows-x64, build-linux-x64, get-update-version, set-public-provider, get-llamacpp-latest-version, build-docker-x64] secrets: inherit if: github.event_name == 'workflow_dispatch' && github.event.inputs.public_provider == 'aws-s3' uses: ./.github/workflows/template-noti-discord.yaml diff --git a/.github/workflows/stable-build.yml b/.github/workflows/stable-build.yml index b05df983d..c4b5f53f3 100644 --- a/.github/workflows/stable-build.yml +++ b/.github/workflows/stable-build.yml @@ -9,7 +9,7 @@ jobs: get-update-version: uses: ./.github/workflows/template-get-update-version.yml - get-cortex-llamacpp-latest-version: + get-llamacpp-latest-version: uses: ./.github/workflows/template-cortex-llamacpp-latest-version.yml create-draft-release: @@ -39,7 +39,7 @@ jobs: build-macos: uses: ./.github/workflows/template-build-macos.yml - needs: [get-update-version, create-draft-release, get-cortex-llamacpp-latest-version] + needs: [get-update-version, create-draft-release, get-llamacpp-latest-version] secrets: inherit with: ref: ${{ github.ref }} @@ -48,12 +48,12 @@ jobs: cmake-flags: "-DCORTEX_VARIANT=prod -DCORTEX_CPP_VERSION='v${{ needs.get-update-version.outputs.new_version }}' -DCMAKE_TOOLCHAIN_FILE=vcpkg/scripts/buildsystems/vcpkg.cmake" channel: stable upload_url: ${{ needs.create-draft-release.outputs.upload_url }} - cortex-llamacpp-version: ${{ needs.get-cortex-llamacpp-latest-version.outputs.cortex_llamacpp_latest_version }} + llamacpp-version: ${{ needs.get-llamacpp-latest-version.outputs.llamacpp_latest_version }} build-windows-x64: uses: ./.github/workflows/template-build-windows-x64.yml secrets: inherit - needs: [get-update-version, create-draft-release, get-cortex-llamacpp-latest-version] + needs: [get-update-version, create-draft-release, get-llamacpp-latest-version] with: ref: ${{ github.ref }} public_provider: github @@ -64,12 +64,12 @@ jobs: ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache' channel: stable upload_url: ${{ needs.create-draft-release.outputs.upload_url }} - cortex-llamacpp-version: ${{ needs.get-cortex-llamacpp-latest-version.outputs.cortex_llamacpp_latest_version }} + llamacpp-version: ${{ needs.get-llamacpp-latest-version.outputs.llamacpp_latest_version 
}} build-linux-x64: uses: ./.github/workflows/template-build-linux.yml secrets: inherit - needs: [get-update-version, create-draft-release, get-cortex-llamacpp-latest-version] + needs: [get-update-version, create-draft-release, get-llamacpp-latest-version] with: ref: ${{ github.ref }} public_provider: github @@ -78,13 +78,13 @@ jobs: cmake-flags: "-DCORTEX_VARIANT=prod -DCORTEX_CPP_VERSION='v${{ needs.get-update-version.outputs.new_version }}' -DCMAKE_TOOLCHAIN_FILE=/home/runner/actions-runner/_work/cortex.cpp/cortex.cpp/engine/vcpkg/scripts/buildsystems/vcpkg.cmake" channel: stable upload_url: ${{ needs.create-draft-release.outputs.upload_url }} - cortex-llamacpp-version: ${{ needs.get-cortex-llamacpp-latest-version.outputs.cortex_llamacpp_latest_version }} + llamacpp-version: ${{ needs.get-llamacpp-latest-version.outputs.llamacpp_latest_version }} arch: amd64 build-linux-arm64: uses: ./.github/workflows/template-build-linux.yml secrets: inherit - needs: [get-update-version, create-draft-release, get-cortex-llamacpp-latest-version] + needs: [get-update-version, create-draft-release, get-llamacpp-latest-version] with: ref: ${{ github.ref }} public_provider: github @@ -93,13 +93,13 @@ jobs: cmake-flags: "-DCORTEX_VARIANT=prod -DCORTEX_CPP_VERSION='v${{ needs.get-update-version.outputs.new_version }}' -DCMAKE_TOOLCHAIN_FILE=/home/runner/actions-runner/_work/cortex.cpp/cortex.cpp/engine/vcpkg/scripts/buildsystems/vcpkg.cmake" channel: stable upload_url: ${{ needs.create-draft-release.outputs.upload_url }} - cortex-llamacpp-version: ${{ needs.get-cortex-llamacpp-latest-version.outputs.cortex_llamacpp_latest_version }} + llamacpp-version: ${{ needs.get-llamacpp-latest-version.outputs.llamacpp_latest_version }} arch: arm64 build-docker-x64: uses: ./.github/workflows/template-build-docker-x64.yml secrets: inherit - needs: [get-update-version, get-cortex-llamacpp-latest-version] + needs: [get-update-version, get-llamacpp-latest-version] with: ref: ${{ github.ref }} new_version: ${{ needs.get-update-version.outputs.new_version }} diff --git a/.github/workflows/template-build-linux.yml b/.github/workflows/template-build-linux.yml index 3fa802ad4..0ebd04176 100644 --- a/.github/workflows/template-build-linux.yml +++ b/.github/workflows/template-build-linux.yml @@ -44,7 +44,7 @@ on: type: string default: 'nightly' description: 'The channel to use for this job' - cortex-llamacpp-version: + llamacpp-version: required: true type: string default: '0.0.0' @@ -169,23 +169,23 @@ jobs: mkdir -p engine/templates/linux/dependencies cd engine/templates/linux/dependencies if [ "${{ inputs.arch }}" == "amd64" ]; then - # wget https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-linux-amd64-avx-cuda-11-7.tar.gz - # wget https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-linux-amd64-avx-cuda-12-0.tar.gz - # wget https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-linux-amd64-avx.tar.gz - wget https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-linux-amd64-avx2-cuda-11-7.tar.gz - wget https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version 
}}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-linux-amd64-avx2-cuda-12-0.tar.gz - wget https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-linux-amd64-avx2.tar.gz - # wget https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-linux-amd64-avx512-cuda-11-7.tar.gz - # wget https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-linux-amd64-avx512-cuda-12-0.tar.gz - # wget https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-linux-amd64-avx512.tar.gz - wget https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-linux-amd64-noavx-cuda-11-7.tar.gz - wget https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-linux-amd64-noavx-cuda-12-0.tar.gz - wget https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-linux-amd64-noavx.tar.gz - wget https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-linux-amd64-vulkan.tar.gz - wget https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cuda-11-7-linux-amd64.tar.gz - wget https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cuda-12-0-linux-amd64.tar.gz + # wget https://github.com/menloresearch/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/llama-${{ inputs.llamacpp-version }}-bin-linux-avx-cuda-cu11.7-x64.tar.gz + # wget https://github.com/menloresearch/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/llama-${{ inputs.llamacpp-version }}-bin-linux-avx-cuda-cu12.0-x64.tar.gz + # wget https://github.com/menloresearch/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/llama-${{ inputs.llamacpp-version }}-bin-linux-avx-x64.tar.gz + wget https://github.com/menloresearch/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/llama-${{ inputs.llamacpp-version }}-bin-linux-avx2-cuda-cu11.7-x64.tar.gz + wget https://github.com/menloresearch/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/llama-${{ inputs.llamacpp-version }}-bin-linux-avx2-cuda-cu12.0-x64.tar.gz + wget https://github.com/ggml-org/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/llama-${{ inputs.llamacpp-version }}-bin-ubuntu-x64.zip + # wget https://github.com/menloresearch/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/llama-${{ inputs.llamacpp-version }}-bin-linux-avx512-cuda-cu11.7-x64.tar.gz + # wget https://github.com/menloresearch/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/llama-${{ inputs.llamacpp-version }}-bin-linux-avx512-cuda-cu12.0-x64.tar.gz + # wget https://github.com/menloresearch/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/llama-${{ inputs.llamacpp-version }}-bin-linux-avx512-x64.tar.gz + wget 
https://github.com/menloresearch/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/llama-${{ inputs.llamacpp-version }}-bin-linux-noavx-cuda-cu11.7-x64.tar.gz + wget https://github.com/menloresearch/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/llama-${{ inputs.llamacpp-version }}-bin-linux-noavx-cuda-cu12.0-x64.tar.gz + wget https://github.com/menloresearch/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/llama-${{ inputs.llamacpp-version }}-bin-linux-noavx-x64.tar.gz + wget https://github.com/ggml-org/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/llama-${{ inputs.llamacpp-version }}-bin-ubuntu-vulkan-x64.zip + wget https://github.com/menloresearch/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/cudart-llama-bin-linux-cu11.7-x64.tar.gz + wget https://github.com/menloresearch/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/cudart-llama-bin-linux-cu12.0-x64.tar.gz else - wget https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-linux-arm64.tar.gz fi cd .. diff --git a/.github/workflows/template-build-macos.yml b/.github/workflows/template-build-macos.yml index 20c7430fb..ea96d2df6 100644 --- a/.github/workflows/template-build-macos.yml +++ b/.github/workflows/template-build-macos.yml @@ -39,7 +39,7 @@ on: type: string default: 'nightly' description: 'The channel to use for this job' - cortex-llamacpp-version: + llamacpp-version: required: true type: string default: '0.0.0' @@ -253,6 +253,14 @@ jobs: cd engine make codesign-binary CODE_SIGN=true DEVELOPER_ID="${{ secrets.DEVELOPER_ID }}" DESTINATION_BINARY_NAME="${{ steps.set-output-params.outputs.destination_binary_name }}" DESTINATION_BINARY_SERVER_NAME="${{ steps.set-output-params.outputs.destination_binary_server_name }}" + - name: Code Signing binaries for separate binary + run: | + codesign --force -s "${{ secrets.DEVELOPER_ID }}" --options=runtime --entitlements="./engine/templates/macos/entitlements.plist" ./cortex-${{ inputs.new_version }}-mac-arm64/${{ steps.set-output-params.outputs.destination_binary_name }} + codesign --force -s "${{ secrets.DEVELOPER_ID }}" --options=runtime --entitlements="./engine/templates/macos/entitlements.plist" ./cortex-${{ inputs.new_version }}-mac-arm64/${{ steps.set-output-params.outputs.destination_binary_server_name }} + + codesign --force -s "${{ secrets.DEVELOPER_ID }}" --options=runtime --entitlements="./engine/templates/macos/entitlements.plist" ./cortex-${{ inputs.new_version }}-mac-amd64/${{ steps.set-output-params.outputs.destination_binary_name }} + codesign --force -s "${{ secrets.DEVELOPER_ID }}" --options=runtime --entitlements="./engine/templates/macos/entitlements.plist" ./cortex-${{ inputs.new_version }}-mac-amd64/${{ steps.set-output-params.outputs.destination_binary_server_name }} + - name: Notary macOS Binary run: | curl -sSfL https://raw.githubusercontent.com/anchore/quill/main/install.sh | sh -s -- -b /usr/local/bin @@ -265,6 +273,18 @@ jobs: QUILL_NOTARY_ISSUER: ${{ secrets.NOTARY_ISSUER }} QUILL_NOTARY_KEY: "/tmp/notary-key.p8" + - name: Notary macOS Binary for separate binary + run: | + # Notarize the binary + quill notarize ./cortex-${{ inputs.new_version }}-mac-arm64/${{ steps.set-output-params.outputs.destination_binary_name }} + quill notarize ./cortex-${{
inputs.new_version }}-mac-arm64/${{ steps.set-output-params.outputs.destination_binary_server_name }} + quill notarize ./cortex-${{ inputs.new_version }}-mac-amd64/${{ steps.set-output-params.outputs.destination_binary_name }} + quill notarize ./cortex-${{ inputs.new_version }}-mac-amd64/${{ steps.set-output-params.outputs.destination_binary_server_name }} + env: + QUILL_NOTARY_KEY_ID: ${{ secrets.NOTARY_KEY_ID }} + QUILL_NOTARY_ISSUER: ${{ secrets.NOTARY_ISSUER }} + QUILL_NOTARY_KEY: "/tmp/notary-key.p8" + - name: Build network Installers shell: bash run: | @@ -289,8 +309,8 @@ jobs: run: | mkdir -p engine/templates/macos/Scripts/dependencies cd engine/templates/macos/Scripts/dependencies - wget https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-mac-arm64.tar.gz - wget https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-mac-amd64.tar.gz + wget https://github.com/menloresearch/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/llama-${{ inputs.llamacpp-version }}-bin-macos-arm64.tar.gz + wget https://github.com/menloresearch/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/llama-${{ inputs.llamacpp-version }}-bin-macos-x64.tar.gz cd ../../ chmod +x create_pkg_local.sh @@ -310,6 +330,24 @@ jobs: xcrun notarytool submit ${{ steps.set-output-params.outputs.package_name }}-local.pkg --apple-id ${{ secrets.APPLE_ID }} --password ${{ secrets.APPLE_APP_SPECIFIC_PASSWORD }} --team-id ${{ secrets.APPLE_TEAM_ID }} --wait - name: Package + run: | + mkdir temp + # Mac arm64 + mv cortex-${{ inputs.new_version }}-mac-arm64 temp/cortex + cd temp + tar -czvf cortex-arm64.tar.gz cortex + mv cortex-arm64.tar.gz ../cortex-arm64.tar.gz + cd .. + rm -rf temp/cortex + + # Mac amd64 + mv cortex-${{ inputs.new_version }}-mac-amd64 temp/cortex + cd temp + tar -czvf cortex-amd64.tar.gz cortex + mv cortex-amd64.tar.gz ../cortex-amd64.tar.gz + cd .. 
+ + - name: Package for separate binary run: | cd engine make package @@ -320,6 +358,18 @@ jobs: name: cortex-${{ inputs.new_version }}-mac-universal path: ./engine/cortex + - name: Upload Artifact + uses: actions/upload-artifact@v4 + with: + name: cortex-${{ inputs.new_version }}-mac-arm64-signed + path: ./cortex-${{ inputs.new_version }}-mac-arm64 + + - name: Upload Artifact + uses: actions/upload-artifact@v4 + with: + name: cortex-${{ inputs.new_version }}-mac-amd64-signed + path: ./cortex-${{ inputs.new_version }}-mac-amd64 + - name: Upload Artifact uses: actions/upload-artifact@v4 with: @@ -358,6 +408,28 @@ jobs: asset_name: cortex-${{ inputs.new_version }}-mac-universal.tar.gz asset_content_type: application/zip + - name: Upload release assert if public provider is github + if: inputs.public_provider == 'github' + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + uses: actions/upload-release-asset@v1.0.1 + with: + upload_url: ${{ inputs.upload_url }} + asset_path: ./cortex-arm64.tar.gz + asset_name: cortex-${{ inputs.new_version }}-mac-arm64.tar.gz + asset_content_type: application/zip + + - name: Upload release assert if public provider is github + if: inputs.public_provider == 'github' + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + uses: actions/upload-release-asset@v1.0.1 + with: + upload_url: ${{ inputs.upload_url }} + asset_path: ./cortex-amd64.tar.gz + asset_name: cortex-${{ inputs.new_version }}-mac-amd64.tar.gz + asset_content_type: application/zip + - name: Upload release assert if public provider is github if: inputs.public_provider == 'github' env: diff --git a/.github/workflows/template-build-windows-x64.yml b/.github/workflows/template-build-windows-x64.yml index b9e0c9937..399e3dd3e 100644 --- a/.github/workflows/template-build-windows-x64.yml +++ b/.github/workflows/template-build-windows-x64.yml @@ -44,7 +44,7 @@ on: type: string default: 'nightly' description: 'The channel to use for this job' - cortex-llamacpp-version: + llamacpp-version: required: true type: string default: '0.0.0' @@ -205,21 +205,21 @@ jobs: run: | mkdir dependencies cd dependencies - # wget.exe https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-windows-amd64-avx-cuda-11-7.tar.gz - # wget.exe https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-windows-amd64-avx-cuda-12-0.tar.gz - # wget.exe https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-windows-amd64-avx.tar.gz - wget.exe https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-windows-amd64-avx2-cuda-11-7.tar.gz - wget.exe https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-windows-amd64-avx2-cuda-12-0.tar.gz - wget.exe https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-windows-amd64-avx2.tar.gz - # wget.exe https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-windows-amd64-avx512-cuda-11-7.tar.gz 
- # wget.exe https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-windows-amd64-avx512-cuda-12-0.tar.gz - # wget.exe https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-windows-amd64-avx512.tar.gz - wget.exe https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-windows-amd64-noavx-cuda-11-7.tar.gz - wget.exe https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-windows-amd64-noavx-cuda-12-0.tar.gz - wget.exe https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-windows-amd64-noavx.tar.gz - wget.exe https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-windows-amd64-vulkan.tar.gz - wget.exe https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cuda-11-7-windows-amd64.tar.gz - wget.exe https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cuda-12-0-windows-amd64.tar.gz + # wget.exe https://github.com/menloresearch/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/llama-${{ inputs.llamacpp-version }}-bin-win-avx-cuda-cu11.7-x64.tar.gz + # wget.exe https://github.com/menloresearch/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/llama-${{ inputs.llamacpp-version }}-bin-win-avx-cuda-cu12.0-x64.tar.gz + # wget.exe https://github.com/ggml-org/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/llama-${{ inputs.llamacpp-version }}-bin-win-avx-x64.zip + wget.exe https://github.com/menloresearch/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/llama-${{ inputs.llamacpp-version }}-bin-win-avx2-cuda-cu11.7-x64.tar.gz + wget.exe https://github.com/menloresearch/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/llama-${{ inputs.llamacpp-version }}-bin-win-avx2-cuda-cu12.0-x64.tar.gz + wget.exe https://github.com/ggml-org/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/llama-${{ inputs.llamacpp-version }}-bin-win-avx2-x64.zip + # wget.exe https://github.com/menloresearch/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/llama-${{ inputs.llamacpp-version }}-bin-win-avx512-cuda-cu11.7-x64.tar.gz + # wget.exe https://github.com/menloresearch/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/llama-${{ inputs.llamacpp-version }}-bin-win-avx512-cuda-cu12.0-x64.tar.gz + # wget.exe https://github.com/ggml-org/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/llama-${{ inputs.llamacpp-version }}-bin-win-avx512-x64.zip + wget.exe https://github.com/menloresearch/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/llama-${{ inputs.llamacpp-version }}-bin-win-noavx-cuda-cu11.7-x64.tar.gz + wget.exe https://github.com/menloresearch/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/llama-${{ inputs.llamacpp-version }}-bin-win-noavx-cuda-cu12.0-x64.tar.gz + wget.exe https://github.com/ggml-org/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/llama-${{ inputs.llamacpp-version 
}}-bin-win-noavx-x64.zip + wget.exe https://github.com/ggml-org/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/llama-${{ inputs.llamacpp-version }}-bin-win-vulkan-x64.zip + wget.exe https://github.com/menloresearch/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/cudart-llama-bin-win-cu11.7-x64.tar.gz + wget.exe https://github.com/menloresearch/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/cudart-llama-bin-win-cu12.0-x64.tar.gz - name: Enable long paths run: | diff --git a/.github/workflows/template-cortex-llamacpp-latest-version.yml b/.github/workflows/template-cortex-llamacpp-latest-version.yml index 610b1a89a..3d7b74e56 100644 --- a/.github/workflows/template-cortex-llamacpp-latest-version.yml +++ b/.github/workflows/template-cortex-llamacpp-latest-version.yml @@ -1,13 +1,13 @@ -name: get-cortex-llamacpp-latest-version +name: get-llamacpp-latest-version on: workflow_call: outputs: - cortex_llamacpp_latest_version: + llamacpp_latest_version: description: 'The latest version of cortex.llamacpp engines' - value: ${{ jobs.get-cortex-llamacpp-latest-version.outputs.new_version }} + value: ${{ jobs.get-llamacpp-latest-version.outputs.new_version }} jobs: - get-cortex-llamacpp-latest-version: + get-llamacpp-latest-version: runs-on: ubuntu-latest outputs: new_version: ${{ steps.version_update.outputs.new_version }} @@ -24,7 +24,7 @@ jobs: local max_retries=3 local tag while [ $retries -lt $max_retries ]; do - tag=$(curl -s https://api.github.com/repos/menloresearch/cortex.llamacpp/releases/latest | jq -r .tag_name) + tag=$(curl -s https://api.github.com/repos/menloresearch/llama.cpp/releases/latest | jq -r .tag_name) if [ -n "$tag" ] && [ "$tag" != "null" ]; then echo $tag return diff --git a/docker/Dockerfile b/docker/Dockerfile index 744c3899c..5f04da12e 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -24,7 +24,6 @@ RUN wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/nul apt-add-repository "deb https://apt.kitware.com/ubuntu/ $(lsb_release -cs) main" && \ apt-get update && \ apt-get install -y --no-install-recommends \ - cmake \ make \ git \ uuid-dev \ @@ -37,11 +36,21 @@ RUN wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/nul ninja-build \ pkg-config \ python3-pip \ - openssl && \ + openssl \ + libssl-dev && \ pip3 install awscli && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* +# Download and install CMake 3.22.6 +RUN wget https://github.com/Kitware/CMake/releases/download/v3.22.6/cmake-3.22.6.tar.gz -q -O /tmp/cmake.tar.gz && \ + tar -xzf /tmp/cmake.tar.gz -C /tmp && \ + cd /tmp/cmake-3.22.6 && \ + ./bootstrap && \ + make -j$(nproc) && \ + make install && \ + rm -rf /tmp/cmake.tar.gz /tmp/cmake-3.22.6 + ARG CORTEX_CPP_VERSION=latest ARG CMAKE_EXTRA_FLAGS="" diff --git a/docker/Dockerfile.cache b/docker/Dockerfile.cache index 0a9cbe02d..3eabc5dce 100644 --- a/docker/Dockerfile.cache +++ b/docker/Dockerfile.cache @@ -24,7 +24,6 @@ RUN wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/nul apt-add-repository "deb https://apt.kitware.com/ubuntu/ $(lsb_release -cs) main" && \ apt-get update && \ apt-get install -y --no-install-recommends \ - cmake \ make \ git \ uuid-dev \ @@ -37,11 +36,21 @@ RUN wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/nul ninja-build \ pkg-config \ python3-pip \ - openssl && \ + openssl \ + libssl-dev && \ pip3 install awscli && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* +# Download and install CMake 3.22.6 +RUN wget 
https://github.com/Kitware/CMake/releases/download/v3.22.6/cmake-3.22.6.tar.gz -q -O /tmp/cmake.tar.gz && \ + tar -xzf /tmp/cmake.tar.gz -C /tmp && \ + cd /tmp/cmake-3.22.6 && \ + ./bootstrap && \ + make -j$(nproc) && \ + make install && \ + rm -rf /tmp/cmake.tar.gz /tmp/cmake-3.22.6 + ARG CORTEX_CPP_VERSION=latest ARG CMAKE_EXTRA_FLAGS="" diff --git a/docs/docs/engines/engine-extension.mdx b/docs/docs/engines/engine-extension.mdx index d2edde830..8b550c5a4 100644 --- a/docs/docs/engines/engine-extension.mdx +++ b/docs/docs/engines/engine-extension.mdx @@ -71,9 +71,6 @@ class EngineI { std::shared_ptr json_body, std::function&& callback) = 0; - // Compatibility and model management - virtual bool IsSupported(const std::string& f) = 0; - virtual void GetModels( std::shared_ptr jsonBody, std::function&& callback) = 0; diff --git a/docs/static/openapi/cortex.json b/docs/static/openapi/cortex.json index 23970ef51..b7d628094 100644 --- a/docs/static/openapi/cortex.json +++ b/docs/static/openapi/cortex.json @@ -2754,7 +2754,7 @@ }, "version": { "type": "string", - "example": "0.1.35-28.10.24" + "example": "b4920" } } } @@ -2763,11 +2763,11 @@ { "engine": "llama-cpp", "name": "mac-arm64", - "version": "0.1.35-28.10.24" + "version": "b4920" }, { "engine": "llama-cpp", - "name": "linux-amd64-avx", + "name": "linux-avx-x64", "version": "0.1.35-27.10.24" } ] @@ -2901,7 +2901,7 @@ "name": { "type": "string", "description": "The name of the variant, including OS, architecture, and capabilities", - "example": "linux-amd64-avx-cuda-11-7" + "example": "linux-avx-x64-cuda-11-7" }, "created_at": { "type": "string", @@ -2973,7 +2973,7 @@ }, "name": { "type": "string", - "example": "0.1.39-linux-amd64-avx-cuda-11-7" + "example": "llama-b4920-bin-linux-avx-cuda-cu11.7" }, "size": { "type": "integer", @@ -3250,7 +3250,7 @@ }, "version": { "type": "string", - "example": "0.1.35-28.10.24" + "example": "b4920" } } } diff --git a/engine/CMakeLists.txt b/engine/CMakeLists.txt index f7a20b58b..39052b08e 100644 --- a/engine/CMakeLists.txt +++ b/engine/CMakeLists.txt @@ -182,6 +182,7 @@ add_executable(${TARGET_NAME} main.cc ${CMAKE_CURRENT_SOURCE_DIR}/utils/process/utils.cc ${CMAKE_CURRENT_SOURCE_DIR}/extensions/remote-engine/remote_engine.cc + ${CMAKE_CURRENT_SOURCE_DIR}/extensions/local-engine/local_engine.cc ) @@ -227,3 +228,12 @@ set_target_properties(${TARGET_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY_RELEASE ${CMAKE_BINARY_DIR} RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR} ) + +if(MSVC) + add_custom_command( + TARGET ${TARGET_NAME} POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy_directory + ${CMAKE_CURRENT_SOURCE_DIR}/../.github/patches/windows + ${CMAKE_BINARY_DIR}/ + ) +endif() \ No newline at end of file diff --git a/engine/cli/CMakeLists.txt b/engine/cli/CMakeLists.txt index 4163042d0..bb18433fe 100644 --- a/engine/cli/CMakeLists.txt +++ b/engine/cli/CMakeLists.txt @@ -73,7 +73,7 @@ add_executable(${TARGET_NAME} main.cc ${CMAKE_CURRENT_SOURCE_DIR}/../services/hardware_service.cc ${CMAKE_CURRENT_SOURCE_DIR}/../services/database_service.cc ${CMAKE_CURRENT_SOURCE_DIR}/../extensions/remote-engine/remote_engine.cc - + ${CMAKE_CURRENT_SOURCE_DIR}/../extensions/local-engine/local_engine.cc ${CMAKE_CURRENT_SOURCE_DIR}/../extensions/template_renderer.cc ${CMAKE_CURRENT_SOURCE_DIR}/utils/easywsclient.cc diff --git a/engine/cli/command_line_parser.cc b/engine/cli/command_line_parser.cc index 99f51983e..aa0b9aab4 100644 --- a/engine/cli/command_line_parser.cc +++ b/engine/cli/command_line_parser.cc @@ -33,6 +33,7 @@ #include 
"services/engine_service.h" #include "utils/file_manager_utils.h" #include "utils/logging_utils.h" +#include "utils/task_queue.h" namespace { constexpr const auto kCommonCommandsGroup = "Common Commands"; @@ -50,8 +51,7 @@ CommandLineParser::CommandLineParser() download_service_{std::make_shared()}, dylib_path_manager_{std::make_shared()}, db_service_{std::make_shared()}, - engine_service_{std::make_shared( - download_service_, dylib_path_manager_, db_service_)} {} + engine_service_{std::make_shared(dylib_path_manager_)} {} bool CommandLineParser::SetupCommand(int argc, char** argv) { app_.usage("Usage:\n" + commands::GetCortexBinary() + diff --git a/engine/cli/commands/cortex_upd_cmd.cc b/engine/cli/commands/cortex_upd_cmd.cc index e11ad4290..33a51ed53 100644 --- a/engine/cli/commands/cortex_upd_cmd.cc +++ b/engine/cli/commands/cortex_upd_cmd.cc @@ -532,10 +532,10 @@ bool CortexUpdCmd::GetLinuxInstallScript(const std::string& v, const std::string& channel) { std::vector path_list; if (channel == "nightly") { - path_list = {"menloresearch", "cortex.cpp", "dev", "engine", + path_list = {kMenloOrg, "cortex.cpp", "dev", "engine", "templates", "linux", "install.sh"}; } else { - path_list = {"menloresearch", "cortex.cpp", "main", "engine", + path_list = {kMenloOrg, "cortex.cpp", "main", "engine", "templates", "linux", "install.sh"}; } auto url_obj = url_parser::Url{ diff --git a/engine/cli/commands/cortex_upd_cmd.h b/engine/cli/commands/cortex_upd_cmd.h index 7f02839cf..fdee6cc49 100644 --- a/engine/cli/commands/cortex_upd_cmd.h +++ b/engine/cli/commands/cortex_upd_cmd.h @@ -79,9 +79,9 @@ inline std::vector GetReleasePath() { if (CORTEX_VARIANT == file_manager_utils::kNightlyVariant) { return {"cortex", "latest", "version.json"}; } else if (CORTEX_VARIANT == file_manager_utils::kBetaVariant) { - return {"repos", "menloresearch", "cortex.cpp", "releases"}; + return {"repos", kMenloOrg, "cortex.cpp", "releases"}; } else { - return {"repos", "menloresearch", "cortex.cpp", "releases", "latest"}; + return {"repos", kMenloOrg, "cortex.cpp", "releases", "latest"}; } } diff --git a/engine/cli/commands/engine_install_cmd.cc b/engine/cli/commands/engine_install_cmd.cc index bebfdb8ce..b31aecaa6 100644 --- a/engine/cli/commands/engine_install_cmd.cc +++ b/engine/cli/commands/engine_install_cmd.cc @@ -92,7 +92,10 @@ bool EngineInstallCmd::Exec(const std::string& engine, std::vector variant_selections; for (const auto& variant : variant_result.value()) { auto v_name = variant["name"].asString(); - if (string_utils::StringContainsIgnoreCase(v_name, hw_inf_.sys_inf->os) && + if ((string_utils::StringContainsIgnoreCase(v_name, + hw_inf_.sys_inf->os) || + (hw_inf_.sys_inf->os == kLinuxOs && + string_utils::StringContainsIgnoreCase(v_name, kUbuntuOs))) && string_utils::StringContainsIgnoreCase(v_name, hw_inf_.sys_inf->arch)) { variant_selections.push_back(variant["name"].asString()); diff --git a/engine/cli/commands/server_start_cmd.cc b/engine/cli/commands/server_start_cmd.cc index af2d647e2..e074ee18a 100644 --- a/engine/cli/commands/server_start_cmd.cc +++ b/engine/cli/commands/server_start_cmd.cc @@ -106,10 +106,8 @@ bool ServerStartCmd::Exec(const std::string& host, int port, #else std::vector commands; // Some engines requires to add lib search path before process being created - auto download_srv = std::make_shared(); - auto dylib_path_mng = std::make_shared(); - auto db_srv = std::make_shared(); - EngineService(download_srv, dylib_path_mng, db_srv).RegisterEngineLibPath(); + 
EngineService(std::make_shared()) + .RegisterEngineLibPath(); std::string p = cortex_utils::GetCurrentPath() + "/" + exe; commands.push_back(p); diff --git a/engine/cli/main.cc b/engine/cli/main.cc index a4e6c38cc..1fa45d6fd 100644 --- a/engine/cli/main.cc +++ b/engine/cli/main.cc @@ -155,7 +155,7 @@ int main(int argc, char* argv[]) { auto get_latest_version = []() -> cpp::result { try { auto res = github_release_utils::GetReleaseByVersion( - "menloresearch", "cortex.llamacpp", "latest"); + kGgmlOrg, kLlamaRepo, "latest"); if (res.has_error()) { CTL_ERR("Failed to get latest llama.cpp version: " << res.error()); return cpp::fail("Failed to get latest llama.cpp version: " + diff --git a/engine/cli/utils/download_progress.cc b/engine/cli/utils/download_progress.cc index 7538fff46..32cc6e20a 100644 --- a/engine/cli/utils/download_progress.cc +++ b/engine/cli/utils/download_progress.cc @@ -83,8 +83,8 @@ bool DownloadProgress::Handle( size_t max_length = 20) -> std::string { // Check the length of the input string if (str.length() >= max_length) { - return str.substr( - 0, max_length); // Return truncated string if it's too long + return str.substr(0, max_length - 3) + + ".. "; // Return truncated string if it's too long } // Calculate the number of spaces needed diff --git a/engine/controllers/engines.cc b/engine/controllers/engines.cc index f7deb41eb..2a9427abf 100644 --- a/engine/controllers/engines.cc +++ b/engine/controllers/engines.cc @@ -155,6 +155,7 @@ void Engines::GetEngineVariants( releases.append(json.value()); } } + CTL_INF(releases.toStyledString()); auto resp = cortex_utils::CreateCortexHttpJsonResponse(releases); resp->setStatusCode(k200OK); callback(resp); @@ -177,6 +178,8 @@ void Engines::InstallEngine( } norm_version = version; } + CTL_INF("version: " << norm_version + << ", norm_variant: " << norm_variant.value_or("")); auto result = engine_service_->InstallEngineAsync(engine, norm_version, norm_variant); diff --git a/engine/controllers/server.cc b/engine/controllers/server.cc index 079b69423..6ea733a70 100644 --- a/engine/controllers/server.cc +++ b/engine/controllers/server.cc @@ -138,7 +138,7 @@ void server::ProcessStreamRes(std::function cb, auto err_or_done = std::make_shared(false); auto chunked_content_provider = [this, q, err_or_done, engine_type, model_id]( char* buf, - std::size_t buf_size) -> std::size_t { + std::size_t buf_size) -> std::size_t { if (buf == nullptr) { LOG_TRACE << "Buf is null"; if (!(*err_or_done)) { diff --git a/engine/cortex-common/EngineI.h b/engine/cortex-common/EngineI.h index b796ebaed..2518b0ce5 100644 --- a/engine/cortex-common/EngineI.h +++ b/engine/cortex-common/EngineI.h @@ -47,9 +47,6 @@ class EngineI { std::shared_ptr json_body, std::function&& callback) = 0; - // For backward compatible checking - virtual bool IsSupported(const std::string& f) = 0; - // Get list of running models virtual void GetModels( std::shared_ptr jsonBody, diff --git a/engine/cortex-common/remote_enginei.h b/engine/cortex-common/remote_enginei.h index 835f526a0..163490cdc 100644 --- a/engine/cortex-common/remote_enginei.h +++ b/engine/cortex-common/remote_enginei.h @@ -1,7 +1,5 @@ #pragma once -#pragma once - #include #include diff --git a/engine/e2e-test/api/engines/test_api_engine.py b/engine/e2e-test/api/engines/test_api_engine.py index 7356ef904..842ef2c35 100644 --- a/engine/e2e-test/api/engines/test_api_engine.py +++ b/engine/e2e-test/api/engines/test_api_engine.py @@ -28,14 +28,14 @@ def test_engines_get_llamacpp_should_be_successful(self): # engines 
install def test_engines_install_llamacpp_specific_version_and_variant(self): - data = {"version": "v0.1.40-b4354", "variant": "linux-amd64-avx"} + data = {"version": "b4932", "variant": "linux-avx-x64"} response = requests.post( "http://localhost:3928/v1/engines/llama-cpp/install", json=data ) assert response.status_code == 200 def test_engines_install_llamacpp_specific_version_and_null_variant(self): - data = {"version": "v0.1.40-b4354"} + data = {"version": "b4932"} response = requests.post( "http://localhost:3928/v1/engines/llama-cpp/install", json=data ) @@ -55,14 +55,14 @@ async def test_engines_install_uninstall_llamacpp_should_be_successful(self): @pytest.mark.asyncio async def test_engines_install_uninstall_llamacpp_with_only_version_should_be_failed(self): # install first - data = {"variant": "mac-arm64"} + data = {"variant": "linux-avx-x64"} install_response = requests.post( "http://127.0.0.1:3928/v1/engines/llama-cpp/install", json=data ) await wait_for_websocket_download_success_event(timeout=120) assert install_response.status_code == 200 - data = {"version": "v0.1.35"} + data = {"version": "b4932"} response = requests.delete( "http://localhost:3928/v1/engines/llama-cpp/install", json=data ) @@ -72,7 +72,7 @@ async def test_engines_install_uninstall_llamacpp_with_only_version_should_be_fa @pytest.mark.asyncio async def test_engines_install_uninstall_llamacpp_with_variant_should_be_successful(self): # install first - data = {"variant": "mac-arm64"} + data = {"variant": "linux-avx-x64"} install_response = requests.post( "http://127.0.0.1:3928/v1/engines/llama-cpp/install", json=data ) @@ -85,7 +85,7 @@ async def test_engines_install_uninstall_llamacpp_with_variant_should_be_success def test_engines_install_uninstall_llamacpp_with_specific_variant_and_version_should_be_successful( self, ): - data = {"variant": "mac-arm64", "version": "v0.1.35"} + data = {"variant": "linux-avx-x64", "version": "b4932"} # install first install_response = requests.post( "http://localhost:3928/v1/engines/llama-cpp/install", json=data diff --git a/engine/e2e-test/api/engines/test_api_engine_install_nightly.py b/engine/e2e-test/api/engines/test_api_engine_install_nightly.py index e92afb14b..088cc2474 100644 --- a/engine/e2e-test/api/engines/test_api_engine_install_nightly.py +++ b/engine/e2e-test/api/engines/test_api_engine_install_nightly.py @@ -2,7 +2,7 @@ import requests from utils.test_runner import start_server, stop_server, get_latest_pre_release_tag -latest_pre_release_tag = get_latest_pre_release_tag("menloresearch", "cortex.llamacpp") +latest_pre_release_tag = get_latest_pre_release_tag("menloresearch", "llama.cpp") class TestApiEngineInstall: @@ -23,7 +23,7 @@ def test_engines_install_llamacpp_should_be_successful(self): assert response.status_code == 200 def test_engines_install_llamacpp_specific_version_and_variant(self): - data = {"version": latest_pre_release_tag, "variant": "linux-amd64-avx"} + data = {"version": latest_pre_release_tag, "variant": "linux-avx-x64"} response = requests.post( "http://localhost:3928/v1/engines/llama-cpp/install", json=data ) diff --git a/engine/e2e-test/api/engines/test_api_get_default_engine.py b/engine/e2e-test/api/engines/test_api_get_default_engine.py index 2dfc467a3..f0566128c 100644 --- a/engine/e2e-test/api/engines/test_api_get_default_engine.py +++ b/engine/e2e-test/api/engines/test_api_get_default_engine.py @@ -24,8 +24,8 @@ def setup_and_teardown(self): def test_api_get_default_engine_successfully(self): # Data test engine= "llama-cpp" - name= 
"linux-amd64-avx" - version= "v0.1.35-27.10.24" + name= "linux-avx-x64" + version= "b4932" data = {"version": version, "variant": name} post_install_url = f"http://localhost:3928/v1/engines/{engine}/install" diff --git a/engine/e2e-test/api/engines/test_api_get_list_engine.py b/engine/e2e-test/api/engines/test_api_get_list_engine.py index e6baa22a6..38cb45b39 100644 --- a/engine/e2e-test/api/engines/test_api_get_list_engine.py +++ b/engine/e2e-test/api/engines/test_api_get_list_engine.py @@ -24,8 +24,8 @@ def setup_and_teardown(self): def test_api_get_list_engines_successfully(self): # Data test engine= "llama-cpp" - name= "linux-amd64-avx" - version= "v0.1.35-27.10.24" + name= "linux-avx-x64" + version= "b4932" post_install_url = f"http://localhost:3928/v1/engines/{engine}/install" response = requests.delete( diff --git a/engine/e2e-test/api/engines/test_api_post_default_engine.py b/engine/e2e-test/api/engines/test_api_post_default_engine.py index b2b4e4c48..cede78485 100644 --- a/engine/e2e-test/api/engines/test_api_post_default_engine.py +++ b/engine/e2e-test/api/engines/test_api_post_default_engine.py @@ -23,8 +23,8 @@ def setup_and_teardown(self): def test_api_set_default_engine_successfully(self): # Data test engine= "llama-cpp" - name= "linux-amd64-avx" - version= "v0.1.35-27.10.24" + name= "linux-avx-x64" + version= "b4932" data = {"version": version, "variant": name} post_install_url = f"http://localhost:3928/v1/engines/{engine}/install" diff --git a/engine/e2e-test/api/files/test_api_create_file.py b/engine/e2e-test/api/files/test_api_create_file.py index 7c7226f50..03525672d 100644 --- a/engine/e2e-test/api/files/test_api_create_file.py +++ b/engine/e2e-test/api/files/test_api_create_file.py @@ -23,7 +23,6 @@ def setup_and_teardown(self): # Teardown stop_server() - @pytest.mark.skipif(platform.system() != "Linux", reason="Todo: fix later on Mac and Window") def test_api_create_file_successfully(self): # Define file path file_path_rel = os.path.join("e2e-test", "api", "files", "blank.txt") diff --git a/engine/e2e-test/api/hardware/test_api_get_hardware.py b/engine/e2e-test/api/hardware/test_api_get_hardware.py index 59b15ac18..0efecdbdc 100644 --- a/engine/e2e-test/api/hardware/test_api_get_hardware.py +++ b/engine/e2e-test/api/hardware/test_api_get_hardware.py @@ -88,25 +88,6 @@ def test_api_get_hardware_successfully(self): "example": True, "description": "Indicates if the GPU is currently activated." }, - "additional_information": { - "type": "object", - "properties": { - "compute_cap": { - "type": "string", - "example": "8.6", - "description": "The compute capability of the GPU." - }, - "driver_version": { - "type": "string", - "example": "535.183", - "description": "The version of the installed driver." 
- } - }, - "required": [ - "compute_cap", - "driver_version" - ] - }, "free_vram": { "type": "integer", "example": 23983, @@ -140,7 +121,6 @@ def test_api_get_hardware_successfully(self): }, "required": [ "activated", - "additional_information", "free_vram", "id", "name", diff --git a/engine/e2e-test/api/model/test_api_model.py b/engine/e2e-test/api/model/test_api_model.py index bacf7e1b0..f370b1daa 100644 --- a/engine/e2e-test/api/model/test_api_model.py +++ b/engine/e2e-test/api/model/test_api_model.py @@ -1,6 +1,7 @@ import pytest import requests import time +import platform from utils.test_runner import ( run, start_server, @@ -95,6 +96,7 @@ async def test_models_start_stop_should_be_successful(self): time.sleep(30) print("Pull model") + requests.delete("http://localhost:3928/v1/models/tinyllama:1b") json_body = {"model": "tinyllama:1b"} response = requests.post("http://localhost:3928/v1/models/pull", json=json_body) assert response.status_code == 200, f"Failed to pull model: tinyllama:1b" @@ -110,16 +112,18 @@ async def test_models_start_stop_should_be_successful(self): response = requests.get("http://localhost:3928/v1/models") assert response.status_code == 200 - print("Start model") - json_body = {"model": "tinyllama:1b"} - response = requests.post( - "http://localhost:3928/v1/models/start", json=json_body - ) - assert response.status_code == 200, f"status_code: {response.status_code}" + # Skip tests for linux arm + if platform.machine() != "aarch64": + print("Start model") + json_body = {"model": "tinyllama:1b"} + response = requests.post( + "http://localhost:3928/v1/models/start", json=json_body + ) + assert response.status_code == 200, f"status_code: {response.status_code}" - print("Stop model") - response = requests.post("http://localhost:3928/v1/models/stop", json=json_body) - assert response.status_code == 200, f"status_code: {response.status_code}" + print("Stop model") + response = requests.post("http://localhost:3928/v1/models/stop", json=json_body) + assert response.status_code == 200, f"status_code: {response.status_code}" # update API print("Update model") diff --git a/engine/e2e-test/cli/engines/test_cli_engine_install.py b/engine/e2e-test/cli/engines/test_cli_engine_install.py index 370ebe3f3..5d520ce8b 100644 --- a/engine/e2e-test/cli/engines/test_cli_engine_install.py +++ b/engine/e2e-test/cli/engines/test_cli_engine_install.py @@ -31,25 +31,9 @@ def test_engines_install_llamacpp_should_be_successfully(self): assert len(response.json()) > 0 assert exit_code == 0, f"Install engine failed with error: {error}" - @pytest.mark.skipif(reason="Ignore onnx-runtime test") - def test_engines_install_onnx_on_macos_should_be_failed(self): - exit_code, output, error = run( - "Install Engine", ["engines", "install", "onnxruntime"] - ) - assert "is not supported on" in output, "Should display error message" - assert exit_code == 0, f"Install engine failed with error: {error}" - - @pytest.mark.skipif(reason="Ignore tensorrt-llm test") - def test_engines_install_onnx_on_tensorrt_should_be_failed(self): - exit_code, output, error = run( - "Install Engine", ["engines", "install", "tensorrt-llm"] - ) - assert "is not supported on" in output, "Should display error message" - assert exit_code == 0, f"Install engine failed with error: {error}" - @pytest.mark.skipif(platform.system() == "Windows", reason="Progress bar log issue on Windows") def test_engines_install_pre_release_llamacpp(self): - engine_version = "v0.1.43" + engine_version = "b4932" exit_code, output, error = run( "Install 
Engine", ["engines", "install", "llama-cpp", "-v", engine_version], diff --git a/engine/e2e-test/runner/cortex-llamacpp-e2e-nightly.py b/engine/e2e-test/runner/cortex-llamacpp-e2e-nightly.py index 9fc296d60..ea3cae242 100644 --- a/engine/e2e-test/runner/cortex-llamacpp-e2e-nightly.py +++ b/engine/e2e-test/runner/cortex-llamacpp-e2e-nightly.py @@ -21,7 +21,7 @@ from api.engines.test_api_get_default_engine import TestApiDefaultEngine from api.engines.test_api_get_engine_release import TestApiEngineRelease from api.engines.test_api_get_engine_release_latest import TestApiEngineReleaseLatest -from test_api_post_default_engine import TestApiSetDefaultEngine +from api.engines.test_api_post_default_engine import TestApiSetDefaultEngine from api.model.test_api_model import TestApiModel from api.model.test_api_model_import import TestApiModelImport from api.files.test_api_create_file import TestApiCreateFile diff --git a/engine/e2e-test/runner/main.py b/engine/e2e-test/runner/main.py index 49bdc5131..8a98d0ca3 100644 --- a/engine/e2e-test/runner/main.py +++ b/engine/e2e-test/runner/main.py @@ -21,7 +21,7 @@ from api.engines.test_api_get_default_engine import TestApiDefaultEngine from api.engines.test_api_get_engine_release import TestApiEngineRelease from api.engines.test_api_get_engine_release_latest import TestApiEngineReleaseLatest -from test_api_post_default_engine import TestApiSetDefaultEngine +from api.engines.test_api_post_default_engine import TestApiSetDefaultEngine from api.model.test_api_model import TestApiModel from api.model.test_api_model_import import TestApiModelImport from api.files.test_api_create_file import TestApiCreateFile diff --git a/engine/e2e-test/test_api_cortexso_hub_llamacpp_engine.py b/engine/e2e-test/test_api_cortexso_hub_llamacpp_engine.py index 7a3c2e232..a22000d93 100644 --- a/engine/e2e-test/test_api_cortexso_hub_llamacpp_engine.py +++ b/engine/e2e-test/test_api_cortexso_hub_llamacpp_engine.py @@ -125,7 +125,7 @@ async def test_models_on_cortexso_hub(self, model_url): "Install Engine", ["engines", "install", "llama-cpp"], timeout=None, capture = False ) root = Path.home() - assert os.path.exists(root / "cortexcpp" / "engines" / "cortex.llamacpp" / "version.txt") + assert os.path.exists(root / "cortexcpp" / "engines" / "llama.cpp" / "version.txt") assert exit_code == 0, f"Install engine failed with error: {error}" # Start the model diff --git a/engine/extensions/local-engine/local_engine.cc b/engine/extensions/local-engine/local_engine.cc new file mode 100644 index 000000000..885c14d77 --- /dev/null +++ b/engine/extensions/local-engine/local_engine.cc @@ -0,0 +1,1035 @@ +#include "local_engine.h" +#include +#include +#include +#include "utils/curl_utils.h" +#include "utils/json_helper.h" +#include "utils/logging_utils.h" +#include "utils/process/utils.h" +#include "utils/url_parser.h" + +namespace cortex::local { + +namespace { +const std::unordered_set kIgnoredParams = { + "model", "model_alias", "embedding", "ai_prompt", + "ai_template", "prompt_template", "mmproj", "system_prompt", + "created", "stream", "name", "os", + "owned_by", "files", "gpu_arch", "quantization_method", + "engine", "system_template", "max_tokens", "user_template", + "user_prompt", "min_keep", "mirostat", "mirostat_eta", + "mirostat_tau", "text_model", "version", "n_probs", + "object", "penalize_nl", "precision", "size", + "stop", "tfs_z", "typ_p"}; + +const std::unordered_map kParamsMap = { + {"cpu_threads", "--threads"}, + {"n_ubatch", "--ubatch-size"}, + {"n_batch", "--batch-size"}, + 
{"n_parallel", "--parallel"}, + {"temperature", "--temp"}, + {"top_k", "--top-k"}, + {"top_p", "--top-p"}, + {"min_p", "--min-p"}, + {"dynatemp_exponent", "--dynatemp-exp"}, + {"ctx_len", "--ctx-size"}, + {"ngl", "-ngl"}, +}; + +int GenerateRandomInteger(int min, int max) { + static std::random_device rd; // Seed for the random number engine + static std::mt19937 gen(rd()); // Mersenne Twister random number engine + std::uniform_int_distribution<> dis( + min, max); // Distribution for the desired range + + return dis(gen); // Generate and return a random integer within the range +} + +std::vector ConvertJsonToParamsVector(const Json::Value& root) { + std::vector res; + std::string errors; + + for (const auto& member : root.getMemberNames()) { + if (member == "model_path" || member == "llama_model_path") { + if (!root[member].isNull()) { + res.push_back("--model"); + res.push_back(root[member].asString()); + } + continue; + } else if (kIgnoredParams.find(member) != kIgnoredParams.end()) { + continue; + } else if (kParamsMap.find(member) != kParamsMap.end()) { + res.push_back(kParamsMap.at(member)); + res.push_back(root[member].asString()); + continue; + } else if (member == "model_type") { + if (root[member].asString() == "embedding") { + res.push_back("--embedding"); + } + continue; + } + + res.push_back("--" + member); + if (root[member].isString()) { + res.push_back(root[member].asString()); + } else if (root[member].isInt()) { + res.push_back(std::to_string(root[member].asInt())); + } else if (root[member].isDouble()) { + res.push_back(std::to_string(root[member].asDouble())); + } else if (root[member].isArray()) { + std::stringstream ss; + ss << "["; + bool first = true; + for (const auto& value : root[member]) { + if (!first) { + ss << ", "; + } + ss << "\"" << value.asString() << "\""; + first = false; + } + ss << "] "; + res.push_back(ss.str()); + } + } + + return res; +} + +constexpr const auto kMinDataChunkSize = 6u; + +struct OaiInfo { + std::string model; + bool include_usage = false; + bool oai_endpoint = false; + int n_probs = 0; +}; + +struct StreamingCallback { + std::shared_ptr callback; + bool need_stop = true; + OaiInfo oi; +}; + +struct Usage { + int prompt_tokens = 0; + int completion_tokens = 0; +}; + +std::string GenerateRandomString(std::size_t length) { + const std::string characters = + "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; + + std::random_device rd; + std::mt19937 generator(rd()); + + std::uniform_int_distribution<> distribution( + 0, static_cast(characters.size()) - 1); + + std::string random_string(length, '\0'); + std::generate_n(random_string.begin(), length, + [&]() { return characters[distribution(generator)]; }); + + return random_string; +} + +std::vector GetUTF8Bytes(const std::string& str) { + std::vector bytes; + for (unsigned char c : str) { + bytes.push_back(static_cast(c)); + } + return bytes; +} + +Json::Value TransformLogProbs(const Json::Value& logprobs) { + Json::Value root; + Json::Value logprobs_json(Json::arrayValue); + + // Iterate through each token group in the input + for (const auto& token_group : logprobs) { + Json::Value content_item; + + // Set the token (content) + content_item["token"] = token_group["content"].asString(); + + // Get the probabilities array + const auto& probs = token_group["probs"]; + + // Set the main token's logprob (first probability) + if (!probs.empty()) { + content_item["logprob"] = std::log( + probs[0]["prob"].asDouble() + std::numeric_limits::epsilon()); + } + + // Get UTF-8 bytes 
for the token + auto bytes = GetUTF8Bytes(token_group["content"].asString()); + Json::Value bytes_array(Json::arrayValue); + for (int byte : bytes) { + bytes_array.append(byte); + } + content_item["bytes"] = bytes_array; + + // Create top_logprobs array + Json::Value top_logprobs(Json::arrayValue); + for (const auto& prob_item : probs) { + Json::Value logprob_item; + logprob_item["token"] = prob_item["tok_str"].asString(); + logprob_item["logprob"] = + std::log(prob_item["prob"].asDouble() + + std::numeric_limits::epsilon()); + + // Get UTF-8 bytes for this alternative token + auto alt_bytes = GetUTF8Bytes(prob_item["tok_str"].asString()); + Json::Value alt_bytes_array(Json::arrayValue); + for (int byte : alt_bytes) { + alt_bytes_array.append(byte); + } + logprob_item["bytes"] = alt_bytes_array; + + top_logprobs.append(logprob_item); + } + content_item["top_logprobs"] = top_logprobs; + + logprobs_json.append(content_item); + } + root["content"] = logprobs_json; + return root; +} + +std::string CreateReturnJson( + const std::string& id, const std::string& model, const std::string& content, + Json::Value finish_reason, bool include_usage, + std::optional usage = std::nullopt, + std::optional logprobs = std::nullopt) { + Json::Value root; + + root["id"] = id; + root["model"] = model; + root["created"] = static_cast(std::time(nullptr)); + root["object"] = "chat.completion.chunk"; + + Json::Value choicesArray(Json::arrayValue); + // If usage, the choices field will always be an empty array + if (!usage) { + Json::Value choice; + + choice["index"] = 0; + Json::Value delta; + delta["content"] = content; + delta["role"] = "assistant"; + choice["delta"] = delta; + choice["finish_reason"] = finish_reason; + if (logprobs.has_value() && !logprobs.value().empty()) { + choice["logprobs"] = TransformLogProbs(logprobs.value()); + } + + choicesArray.append(choice); + } + root["choices"] = choicesArray; + if (include_usage) { + if (usage) { + Json::Value usage_json; + Json::Value details; + details["reasoning_tokens"] = 0; + usage_json["prompt_tokens"] = (*usage).prompt_tokens; + usage_json["completion_tokens"] = (*usage).completion_tokens; + usage_json["total_tokens"] = + (*usage).prompt_tokens + (*usage).completion_tokens; + usage_json["completion_tokens_details"] = details; + root["usage"] = usage_json; + } else { + root["usage"] = Json::Value(); + } + } + + Json::StreamWriterBuilder writer; + writer["indentation"] = ""; // This sets the indentation to an empty string, + // producing compact output. 
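  // [Editor's note - illustrative only, not part of the patch] With
  // include_usage == false, the compact string produced below looks roughly
  // like the following (field order may differ, since jsoncpp sorts object
  // keys alphabetically):
  //   {"choices":[{"delta":{"content":"Hello","role":"assistant"},
  //     "finish_reason":"","index":0}],"created":1710000000,
  //     "id":"<20 random chars>","model":"<model id>",
  //     "object":"chat.completion.chunk"}
  // The streaming WriteCallback prefixes this string with "data: " to form
  // one SSE event.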
+ return Json::writeString(writer, root); +} + +size_t WriteCallback(char* ptr, size_t size, size_t nmemb, void* userdata) { + auto* sc = static_cast(userdata); + size_t data_length = size * nmemb; + + if (ptr && data_length > kMinDataChunkSize) { + std::string chunk(ptr + kMinDataChunkSize, data_length - kMinDataChunkSize); + CTL_DBG(chunk); + if (sc->oi.oai_endpoint) { + if (chunk.find("[DONE]") != std::string::npos) { + Json::Value status; + status["is_done"] = true; + status["has_error"] = false; + status["is_stream"] = true; + status["status_code"] = 200; + Json::Value chunk_json; + chunk_json["data"] = "data: [DONE]"; + sc->need_stop = false; + (*sc->callback)(std::move(status), std::move(chunk_json)); + return data_length; + } + if (!sc->oi.include_usage && + chunk.find("completion_tokens") != std::string::npos) { + return data_length; + } + + Json::Value chunk_json; + chunk_json["data"] = "data: " + chunk; + Json::Value status; + status["is_done"] = false; + status["has_error"] = false; + status["is_stream"] = true; + status["status_code"] = 200; + (*sc->callback)(std::move(status), std::move(chunk_json)); + } else { + if (chunk.find("[DONE]") != std::string::npos) { + Json::Value status; + status["is_done"] = true; + status["has_error"] = false; + status["is_stream"] = true; + status["status_code"] = 200; + Json::Value chunk_json; + chunk_json["data"] = "data: [DONE]"; + sc->need_stop = false; + (*sc->callback)(std::move(status), std::move(chunk_json)); + return data_length; + } + auto json_data = json_helper::ParseJsonString(chunk); + // DONE + if (!json_data.isNull() && json_data.isMember("timings")) { + std::optional u; + if (sc->oi.include_usage) { + u = Usage{json_data["tokens_evaluated"].asInt(), + json_data["tokens_predicted"].asInt()}; + } + + Json::Value chunk_json; + chunk_json["data"] = + "data: " + CreateReturnJson(GenerateRandomString(20), sc->oi.model, + "", "stop", sc->oi.include_usage, u); + Json::Value status; + status["is_done"] = false; + status["has_error"] = false; + status["is_stream"] = true; + status["status_code"] = 200; + (*sc->callback)(std::move(status), std::move(chunk_json)); + + sc->need_stop = false; + return data_length; + } + + Json::Value logprobs; + if (sc->oi.n_probs > 0) { + logprobs = json_data["completion_probabilities"]; + } + std::string to_send; + if (json_data.isMember("choices") && json_data["choices"].isArray() && + json_data["choices"].size() > 0) { + to_send = json_data["choices"][0].get("text", "").asString(); + } + CTL_DBG(to_send); + const std::string str = + CreateReturnJson(GenerateRandomString(20), sc->oi.model, to_send, "", + sc->oi.include_usage, std::nullopt, logprobs); + Json::Value chunk_json; + chunk_json["data"] = "data: " + str; + Json::Value status; + status["is_done"] = false; + status["has_error"] = false; + status["is_stream"] = true; + status["status_code"] = 200; + (*sc->callback)(std::move(status), std::move(chunk_json)); + return data_length; + } + } + + return data_length; +} + +Json::Value ConvertLogitBiasToArray(const Json::Value& input) { + Json::Value result(Json::arrayValue); + if (input.isObject()) { + const auto& member_names = input.getMemberNames(); + for (const auto& tokenStr : member_names) { + Json::Value pair(Json::arrayValue); + pair.append(std::stoi(tokenStr)); + pair.append(input[tokenStr].asFloat()); + result.append(pair); + } + } + return result; +} + +Json::Value CreateFullReturnJson( + const std::string& id, const std::string& model, const std::string& content, + const std::string& 
system_fingerprint, int prompt_tokens, + int completion_tokens, Json::Value finish_reason = Json::Value(), + std::optional logprobs = std::nullopt) { + Json::Value root; + + root["id"] = id; + root["model"] = model; + root["created"] = static_cast(std::time(nullptr)); + root["object"] = "chat.completion"; + root["system_fingerprint"] = system_fingerprint; + + Json::Value choicesArray(Json::arrayValue); + Json::Value choice; + + choice["index"] = 0; + Json::Value message; + message["role"] = "assistant"; + message["content"] = content; + choice["message"] = message; + choice["finish_reason"] = finish_reason; + if (logprobs.has_value() && !logprobs.value().empty()) { + choice["logprobs"] = TransformLogProbs(logprobs.value()); + } + + choicesArray.append(choice); + root["choices"] = choicesArray; + + Json::Value usage; + usage["prompt_tokens"] = prompt_tokens; + usage["completion_tokens"] = completion_tokens; + usage["total_tokens"] = prompt_tokens + completion_tokens; + root["usage"] = usage; + + return root; +} + +} // namespace + +LocalEngine::~LocalEngine() { + for (auto& [_, si] : server_map_) { + (void)cortex::process::KillProcess(si.process_info); + } + server_map_.clear(); +} +void LocalEngine::HandleChatCompletion(std::shared_ptr json_body, + http_callback&& callback) { + auto model_id = json_body->get("model", "").asString(); + if (model_id.empty()) { + CTL_WRN("Model is empty"); + } + if (server_map_.find(model_id) != server_map_.end()) { + auto& s = server_map_[model_id]; + auto oaicompat = [&json_body]() -> bool { + if (json_body->isMember("logprobs") && + (*json_body)["logprobs"].asBool()) { + return false; + } + return true; + }(); + if (oaicompat) { + HandleOpenAiChatCompletion( + json_body, const_cast(callback), model_id); + } else { + HandleNonOpenAiChatCompletion( + json_body, const_cast(callback), model_id); + } + } else { + Json::Value error; + error["error"] = "Model is not loaded yet: " + model_id; + Json::Value status; + status["is_done"] = true; + status["has_error"] = true; + status["is_stream"] = false; + status["status_code"] = 400; + callback(std::move(status), std::move(error)); + } +} + +void LocalEngine::HandleEmbedding(std::shared_ptr json_body, + http_callback&& callback) { + auto model_id = json_body->get("model", "").asString(); + if (model_id.empty()) { + CTL_WRN("Model is empty"); + } + if (server_map_.find(model_id) != server_map_.end()) { + auto& s = server_map_[model_id]; + auto url = url_parser::Url{ + /*.protocol*/ "http", + /*.host*/ s.host + ":" + std::to_string(s.port), + /*.pathParams*/ {"v1", "embeddings"}, + /* .queries = */ {}, + }; + + auto response = curl_utils::SimplePostJson(url.ToFullPath(), + json_body->toStyledString()); + + if (response.has_error()) { + CTL_WRN("Error: " << response.error()); + Json::Value error; + error["error"] = response.error(); + Json::Value status; + status["is_done"] = true; + status["has_error"] = true; + status["is_stream"] = false; + status["status_code"] = 400; + callback(std::move(status), std::move(error)); + } else { + Json::Value status; + status["is_done"] = true; + status["has_error"] = false; + status["is_stream"] = false; + status["status_code"] = 200; + callback(std::move(status), std::move(response.value())); + } + } else { + Json::Value error; + error["error"] = "Model is not loaded yet: " + model_id; + Json::Value status; + status["is_done"] = true; + status["has_error"] = true; + status["is_stream"] = false; + status["status_code"] = 400; + callback(std::move(status), std::move(error)); + } +} + 
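// [Editor's note] Every handler in this file reports results back through the
// http_callback as a (status, body) pair of Json::Value objects. The sketch
// below is illustrative only and is not part of this patch; the helper name
// MakeStatus is hypothetical, and it assumes jsoncpp's Json::Value as already
// included by this file.

Json::Value MakeStatus(bool is_done, bool has_error, bool is_stream,
                       int status_code) {
  Json::Value status;
  status["is_done"] = is_done;          // request fully handled (or failed)
  status["has_error"] = has_error;      // body carries an "error" field
  status["is_stream"] = is_stream;      // body is an SSE chunk, not a full reply
  status["status_code"] = status_code;  // HTTP-style status for the caller
  return status;
}

// Usage sketch, e.g. the "model not loaded" error path repeated by several
// handlers in this file:
//   Json::Value error;
//   error["error"] = "Model is not loaded yet: " + model_id;
//   callback(MakeStatus(/*is_done=*/true, /*has_error=*/true,
//                       /*is_stream=*/false, /*status_code=*/400),
//            std::move(error));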
+void LocalEngine::LoadModel(std::shared_ptr json_body, + http_callback&& callback) { + CTL_INF("Start loading model"); + auto wait_for_server_up = [this](const std::string& model, + const std::string& host, int port) { + auto url = url_parser::Url{ + /*.protocol*/ "http", + /*.host*/ host + ":" + std::to_string(port), + /*.pathParams*/ {"health"}, + /*.queries*/ {}, + }; + while (server_map_.find(model) != server_map_.end()) { + auto res = curl_utils::SimpleGet(url.ToFullPath()); + if (res.has_error()) { + LOG_INFO << "Wait for server up .."; + std::this_thread::sleep_for(std::chrono::seconds(1)); + } else { + return true; + } + } + return false; + }; + + LOG_DEBUG << "Start to spawn llama-server"; + auto model_id = json_body->get("model", "").asString(); + if (model_id.empty()) { + CTL_WRN("Model is empty"); + } + server_map_[model_id].host = "127.0.0.1"; + server_map_[model_id].port = GenerateRandomInteger(39400, 39999); + auto& s = server_map_[model_id]; + s.pre_prompt = json_body->get("pre_prompt", "").asString(); + s.user_prompt = json_body->get("user_prompt", "USER: ").asString(); + s.ai_prompt = json_body->get("ai_prompt", "ASSISTANT: ").asString(); + s.system_prompt = + json_body->get("system_prompt", "ASSISTANT's RULE: ").asString(); + std::vector params = ConvertJsonToParamsVector(*json_body); + params.push_back("--host"); + params.push_back(s.host); + params.push_back("--port"); + params.push_back(std::to_string(s.port)); + + params.push_back("--pooling"); + params.push_back("mean"); + + std::vector v; + v.reserve(params.size() + 1); + auto engine_dir = engine_service_.GetEngineDirPath(kLlamaRepo); + if (engine_dir.has_error()) { + CTL_WRN(engine_dir.error()); + server_map_.erase(model_id); + return; + } + auto exe = (engine_dir.value().first / kLlamaServer).string(); + + v.push_back(exe); + v.insert(v.end(), params.begin(), params.end()); + engine_service_.RegisterEngineLibPath(); + + auto log_path = + (file_manager_utils::GetCortexLogPath() / "logs" / "cortex.log").string(); + CTL_DBG("log: " << log_path); + auto result = cortex::process::SpawnProcess(v, log_path, log_path); + if (result.has_error()) { + CTL_ERR("Fail to spawn process. 
" << result.error()); + Json::Value error; + error["error"] = "Fail to spawn process"; + Json::Value status; + status["is_done"] = true; + status["has_error"] = true; + status["is_stream"] = false; + status["status_code"] = 500; + callback(std::move(status), std::move(error)); + server_map_.erase(model_id); + return; + } + + s.process_info = result.value(); + if (wait_for_server_up(model_id, s.host, s.port)) { + s.start_time = std::chrono::system_clock::now().time_since_epoch() / + std::chrono::milliseconds(1); + Json::Value response; + response["status"] = "Model loaded successfully with pid: " + + std::to_string(s.process_info.pid); + Json::Value status; + status["is_done"] = true; + status["has_error"] = false; + status["is_stream"] = false; + status["status_code"] = 200; + callback(std::move(status), std::move(response)); + } else { + server_map_.erase(model_id); + Json::Value error; + error["error"] = "Wait for server up timeout"; + Json::Value status; + status["is_done"] = true; + status["has_error"] = true; + status["is_stream"] = false; + status["status_code"] = 500; + callback(std::move(status), std::move(error)); + } +} + +void LocalEngine::UnloadModel(std::shared_ptr json_body, + http_callback&& callback) { + auto model_id = json_body->get("model", "").asString(); + if (model_id.empty()) { + CTL_WRN("Model is empty"); + } + + if (server_map_.find(model_id) != server_map_.end()) { + auto& s = server_map_[model_id]; +#if defined(_WIN32) || defined(_WIN64) + auto sent = cortex::process::KillProcess(s.process_info); +#else + auto sent = (kill(s.process_info.pid, SIGTERM) != -1); +#endif + if (sent) { + LOG_INFO << "SIGINT signal sent to child process"; + Json::Value response; + response["status"] = "Model unloaded successfully with pid: " + + std::to_string(s.process_info.pid); + Json::Value status; + status["is_done"] = true; + status["has_error"] = false; + status["is_stream"] = false; + status["status_code"] = 200; + callback(std::move(status), std::move(response)); + server_map_.erase(model_id); + } else { + LOG_ERROR << "Failed to send SIGINT signal to child process"; + Json::Value error; + error["error"] = "Failed to unload model: " + model_id; + Json::Value status; + status["is_done"] = true; + status["has_error"] = true; + status["is_stream"] = false; + status["status_code"] = 500; + callback(std::move(status), std::move(error)); + } + } else { + Json::Value error; + error["error"] = "Model is not loaded yet: " + model_id; + Json::Value status; + status["is_done"] = true; + status["has_error"] = true; + status["is_stream"] = false; + status["status_code"] = 400; + callback(std::move(status), std::move(error)); + } +} + +void LocalEngine::GetModelStatus(std::shared_ptr json_body, + http_callback&& callback) { + auto model_id = json_body->get("model", "").asString(); + if (model_id.empty()) { + CTL_WRN("Model is empty"); + } + if (server_map_.find(model_id) != server_map_.end()) { + Json::Value response; + response["status"] = "Model is loaded"; + Json::Value status; + status["is_done"] = true; + status["has_error"] = false; + status["is_stream"] = false; + status["status_code"] = 200; + callback(std::move(status), std::move(response)); + } else { + Json::Value error; + error["error"] = "Model is not loaded yet: " + model_id; + Json::Value status; + status["is_done"] = true; + status["has_error"] = true; + status["is_stream"] = false; + status["status_code"] = 400; + callback(std::move(status), std::move(error)); + } +} + +void LocalEngine::GetModels(std::shared_ptr json_body, 
+ http_callback&& callback) { + Json::Value json_resp; + Json::Value model_array(Json::arrayValue); + { + for (const auto& [m, s] : server_map_) { + Json::Value val; + val["id"] = m; + val["engine"] = kLlamaEngine; + val["start_time"] = s.start_time; + val["model_size"] = 0u; + val["vram"] = 0u; + val["ram"] = 0u; + val["object"] = "model"; + model_array.append(val); + } + } + + json_resp["object"] = "list"; + json_resp["data"] = model_array; + + Json::Value status; + status["is_done"] = true; + status["has_error"] = false; + status["is_stream"] = false; + status["status_code"] = 200; + callback(std::move(status), std::move(json_resp)); + CTL_INF("Running models responded"); + (void)json_body; +} + +void LocalEngine::HandleOpenAiChatCompletion( + std::shared_ptr json_body, http_callback&& callback, + const std::string& model) { + CTL_DBG("Hanle OpenAI chat completion"); + auto is_stream = (*json_body).get("stream", false).asBool(); + auto include_usage = [&json_body, is_stream]() -> bool { + if (is_stream) { + if (json_body->isMember("stream_options") && + !(*json_body)["stream_options"].isNull()) { + return (*json_body)["stream_options"] + .get("include_usage", false) + .asBool(); + } + return false; + } + return false; + }(); + + auto n = [&json_body, is_stream]() -> int { + if (is_stream) + return 1; + return (*json_body).get("n", 1).asInt(); + }(); + + auto& s = server_map_.at(model); + // Format logit_bias + if (json_body->isMember("logit_bias")) { + auto logit_bias = ConvertLogitBiasToArray((*json_body)["logit_bias"]); + (*json_body)["logit_bias"] = logit_bias; + } + // llama.cpp server only supports n = 1 + (*json_body)["n"] = 1; + + auto url = url_parser::Url{ + /*.protocol*/ "http", + /*.host*/ s.host + ":" + std::to_string(s.port), + /*.pathParams*/ {"v1", "chat", "completions"}, + /*.queries*/ {}, + }; + + if (is_stream) { + q_.RunInQueue([s, json_body, callback, model, url = std::move(url)] { + auto curl = curl_easy_init(); + if (!curl) { + CTL_WRN("Failed to initialize CURL"); + return; + } + + curl_easy_setopt(curl, CURLOPT_URL, url.ToFullPath().c_str()); + curl_easy_setopt(curl, CURLOPT_POST, 1L); + CTL_INF(url.ToFullPath()); + + struct curl_slist* headers = nullptr; + headers = curl_slist_append(headers, "Content-Type: application/json"); + curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers); + auto json_str = json_body->toStyledString(); + curl_easy_setopt(curl, CURLOPT_POSTFIELDS, json_str.c_str()); + curl_easy_setopt(curl, CURLOPT_POSTFIELDSIZE, json_str.length()); + curl_easy_setopt(curl, CURLOPT_TCP_KEEPALIVE, 1L); + + StreamingCallback sc; + OaiInfo oi{model, false /*include_usage*/, true /*oai_endpoint*/, + 0 /*n_probs*/}; + sc.callback = std::make_shared(callback); + sc.need_stop = true; + sc.oi = oi; + + curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteCallback); + curl_easy_setopt(curl, CURLOPT_WRITEDATA, &sc); + auto res = curl_easy_perform(curl); + + if (res != CURLE_OK) { + CTL_WRN("CURL request failed: " << curl_easy_strerror(res)); + + Json::Value status; + status["is_done"] = true; + status["has_error"] = true; + status["is_stream"] = true; + status["status_code"] = 500; + + Json::Value error; + error["error"] = curl_easy_strerror(res); + callback(std::move(status), std::move(error)); + } + curl_easy_cleanup(curl); + if (sc.need_stop) { + CTL_DBG("No stop message received, need to stop"); + Json::Value status; + status["is_done"] = true; + status["has_error"] = false; + status["is_stream"] = true; + status["status_code"] = 200; + 
(*sc.callback)(std::move(status), Json::Value()); + } + }); + + } else { + Json::Value result; + // multiple choices + for (int i = 0; i < n; i++) { + auto response = curl_utils::SimplePostJson(url.ToFullPath(), + json_body->toStyledString()); + + if (response.has_value()) { + auto r = response.value(); + if (i == 0) { + result = r; + } else { + r["choices"][0]["index"] = i; + result["choices"].append(r["choices"][0]); + result["usage"]["completion_tokens"] = + result["usage"]["completion_tokens"].asInt() + + r["usage"]["completion_tokens"].asInt(); + result["usage"]["prompt_tokens"] = + result["usage"]["prompt_tokens"].asInt() + + r["usage"]["prompt_tokens"].asInt(); + result["usage"]["total_tokens"] = + result["usage"]["total_tokens"].asInt() + + r["usage"]["total_tokens"].asInt(); + } + + if (i == n - 1) { + Json::Value status; + status["is_done"] = true; + status["has_error"] = false; + status["is_stream"] = false; + status["status_code"] = 200; + callback(std::move(status), std::move(result)); + } + } else { + CTL_WRN("Error: " << response.error()); + Json::Value status; + status["is_done"] = true; + status["has_error"] = true; + status["is_stream"] = false; + status["status_code"] = 500; + callback(std::move(status), std::move(response.value())); + break; + } + } + } +} + +// (sang) duplicate code but it is easier to clean when +// llama-server upstream is fully OpenAI API Compatible +void LocalEngine::HandleNonOpenAiChatCompletion( + std::shared_ptr json_body, http_callback&& callback, + const std::string& model) { + CTL_DBG("Hanle NonOpenAI chat completion"); + auto is_stream = (*json_body).get("stream", false).asBool(); + auto include_usage = [&json_body, is_stream]() -> bool { + if (is_stream) { + if (json_body->isMember("stream_options") && + !(*json_body)["stream_options"].isNull()) { + return (*json_body)["stream_options"] + .get("include_usage", false) + .asBool(); + } + return false; + } + return false; + }(); + + auto n = [&json_body, is_stream]() -> int { + if (is_stream) + return 1; + return (*json_body).get("n", 1).asInt(); + }(); + + auto& s = server_map_.at(model); + + // Format logit_bias + if (json_body->isMember("logit_bias")) { + auto logit_bias = ConvertLogitBiasToArray((*json_body)["logit_bias"]); + (*json_body)["logit_bias"] = logit_bias; + } + auto get_message = [](const Json::Value& msg_content) -> std::string { + if (msg_content.isArray()) { + for (const auto& mc : msg_content) { + if (mc["type"].asString() == "text") { + return mc["text"].asString(); + } + } + } else { + return msg_content.asString(); + } + return ""; + }; + + if (!json_body->isMember("prompt") || + (*json_body)["prompt"].asString().empty()) { + auto formatted_output = s.pre_prompt; + for (const auto& message : (*json_body)["messages"]) { + auto input_role = message["role"].asString(); + std::string role; + if (input_role == "user") { + role = s.user_prompt; + } else if (input_role == "assistant") { + role = s.ai_prompt; + } else if (input_role == "system") { + role = s.system_prompt; + } else { + role = input_role; + } + + if (auto content = get_message(message["content"]); !content.empty()) { + formatted_output += role + content; + } + } + formatted_output += s.ai_prompt; + (*json_body)["prompt"] = formatted_output; + } + + (*json_body)["n"] = 1; + int n_probs = json_body->get("n_probs", 0).asInt(); + + auto url = url_parser::Url{ + /*.protocol*/ "http", + /*.host*/ s.host + ":" + std::to_string(s.port), + /*.pathParams*/ {"v1", "completions"}, + /*.queries*/ {}, + }; + + if (is_stream) { 
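    // [Editor's note] Streaming branch of the non-OpenAI-compatible path: the
    // prompt assembled above from the pre/user/ai/system templates is POSTed
    // to the spawned llama-server's /v1/completions endpoint on the background
    // task queue; WriteCallback (oai_endpoint == false) re-wraps each upstream
    // chunk into an OpenAI-style "chat.completion.chunk" SSE event, and if no
    // terminal chunk arrives (sc.need_stop stays true) a closing is_done
    // status is emitted so the client stream terminates cleanly.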
+ q_.RunInQueue([s, json_body, callback, n_probs, model, + url = std::move(url)] { + auto curl = curl_easy_init(); + if (!curl) { + CTL_WRN("Failed to initialize CURL"); + return; + } + + curl_easy_setopt(curl, CURLOPT_URL, url.ToFullPath().c_str()); + curl_easy_setopt(curl, CURLOPT_POST, 1L); + + struct curl_slist* headers = nullptr; + headers = curl_slist_append(headers, "Content-Type: application/json"); + curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers); + auto json_str = json_body->toStyledString(); + curl_easy_setopt(curl, CURLOPT_POSTFIELDS, json_str.c_str()); + curl_easy_setopt(curl, CURLOPT_POSTFIELDSIZE, json_str.length()); + curl_easy_setopt(curl, CURLOPT_TCP_KEEPALIVE, 1L); + + StreamingCallback sc; + OaiInfo oi{model, false /*include_usage*/, false /*oai_endpoint*/, + n_probs}; + sc.callback = std::make_shared(callback); + sc.need_stop = true; + sc.oi = oi; + + curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteCallback); + curl_easy_setopt(curl, CURLOPT_WRITEDATA, &sc); + auto res = curl_easy_perform(curl); + + if (res != CURLE_OK) { + CTL_WRN("CURL request failed: " << curl_easy_strerror(res)); + + Json::Value status; + status["is_done"] = true; + status["has_error"] = true; + status["is_stream"] = true; + status["status_code"] = 500; + + Json::Value error; + error["error"] = curl_easy_strerror(res); + callback(std::move(status), std::move(error)); + } + curl_easy_cleanup(curl); + if (sc.need_stop) { + CTL_DBG("No stop message received, need to stop"); + Json::Value status; + status["is_done"] = true; + status["has_error"] = false; + status["is_stream"] = true; + status["status_code"] = 200; + (*sc.callback)(std::move(status), Json::Value()); + } + }); + + } else { + + Json::Value result; + int prompt_tokens = 0; + int predicted_tokens = 0; + // multiple choices + for (int i = 0; i < n; i++) { + auto response = curl_utils::SimplePostJson(url.ToFullPath(), + json_body->toStyledString()); + if (response.has_value()) { + auto r = response.value(); + Json::Value logprobs; + prompt_tokens += r["tokens_evaluated"].asInt(); + predicted_tokens += r["tokens_predicted"].asInt(); + std::string to_send = r["content"].asString(); + string_utils::LTrim(to_send); + if (n_probs > 0) { + logprobs = r["completion_probabilities"]; + } + if (i == 0) { + result = CreateFullReturnJson( + GenerateRandomString(20), model, to_send, "_", prompt_tokens, + predicted_tokens, Json::Value("stop"), logprobs); + } else { + auto choice = CreateFullReturnJson( + GenerateRandomString(20), model, to_send, "_", prompt_tokens, + predicted_tokens, Json::Value("stop"), logprobs)["choices"][0]; + choice["index"] = i; + result["choices"].append(choice); + result["usage"]["completion_tokens"] = predicted_tokens; + result["usage"]["prompt_tokens"] = prompt_tokens; + result["usage"]["total_tokens"] = predicted_tokens + prompt_tokens; + } + + if (i == n - 1) { + Json::Value status; + status["is_done"] = true; + status["has_error"] = false; + status["is_stream"] = false; + status["status_code"] = 200; + callback(std::move(status), std::move(result)); + } + } else { + CTL_WRN("Error: " << response.error()); + Json::Value status; + status["is_done"] = true; + status["has_error"] = true; + status["is_stream"] = false; + status["status_code"] = 500; + callback(std::move(status), std::move(response.value())); + break; + } + } + } +} + +} // namespace cortex::local diff --git a/engine/extensions/local-engine/local_engine.h b/engine/extensions/local-engine/local_engine.h new file mode 100644 index 000000000..6dd970799 --- 
/dev/null +++ b/engine/extensions/local-engine/local_engine.h @@ -0,0 +1,75 @@ +#pragma once + +#include +#include +#include +#include +#include "cortex-common/EngineI.h" +#include "json/json.h" +#include "services/engine_service.h" +#include "utils/process/utils.h" +#include "utils/task_queue.h" + +namespace cortex::local { +using http_callback = std::function; + +struct ServerAddress { + std::string host; + int port; + cortex::process::ProcessInfo process_info; + std::string pre_prompt; + std::string user_prompt; + std::string ai_prompt; + std::string system_prompt; + uint64_t start_time; +}; + +class LocalEngine : public EngineI { + public: + LocalEngine(EngineService& engine_service, TaskQueue& q) + : engine_service_(engine_service), q_(q) {} + ~LocalEngine(); + + void Load(EngineLoadOption opts) final {} + + void Unload(EngineUnloadOption opts) final {} + + void HandleChatCompletion(std::shared_ptr json_body, + http_callback&& callback) final; + void HandleEmbedding(std::shared_ptr json_body, + http_callback&& callback) final; + void LoadModel(std::shared_ptr json_body, + http_callback&& callback) final; + void UnloadModel(std::shared_ptr json_body, + http_callback&& callback) final; + void GetModelStatus(std::shared_ptr json_body, + http_callback&& callback) final; + + // Get list of running models + void GetModels(std::shared_ptr jsonBody, + http_callback&& callback) final; + + bool SetFileLogger(int max_log_lines, const std::string& log_path) final { + return true; + } + void SetLogLevel(trantor::Logger::LogLevel logLevel) final {} + + // Stop inflight chat completion in stream mode + void StopInferencing(const std::string& model_id) final {} + + private: + void HandleOpenAiChatCompletion(std::shared_ptr json_body, + http_callback&& callback, + const std::string& model); + + void HandleNonOpenAiChatCompletion(std::shared_ptr json_body, + http_callback&& callback, + const std::string& model); + + private: + std::unordered_map server_map_; + EngineService& engine_service_; + TaskQueue& q_; +}; + +} // namespace cortex::local diff --git a/engine/main.cc b/engine/main.cc index ab4e74857..abde0441b 100644 --- a/engine/main.cc +++ b/engine/main.cc @@ -196,15 +196,16 @@ void RunServer(bool ignore_cout) { auto config_service = std::make_shared(); auto download_service = std::make_shared(event_queue_ptr, config_service); + auto task_queue = std::make_shared( + std::min(2u, std::thread::hardware_concurrency()), "background_task"); auto engine_service = std::make_shared( - download_service, dylib_path_manager, db_service); + download_service, dylib_path_manager, db_service, task_queue); auto inference_svc = std::make_shared(engine_service); auto model_src_svc = std::make_shared(db_service); - cortex::TaskQueue task_queue( - std::min(2u, std::thread::hardware_concurrency()), "background_task"); - auto model_service = - std::make_shared(db_service, hw_service, download_service, - inference_svc, engine_service, task_queue); + + auto model_service = std::make_shared( + db_service, hw_service, download_service, inference_svc, engine_service, + *task_queue); inference_svc->SetModelService(model_service); auto file_watcher_srv = std::make_shared( diff --git a/engine/repositories/file_fs_repository.cc b/engine/repositories/file_fs_repository.cc index f5b349f45..67c0981ba 100644 --- a/engine/repositories/file_fs_repository.cc +++ b/engine/repositories/file_fs_repository.cc @@ -18,14 +18,10 @@ std::filesystem::path SanitizePath(const std::filesystem::path& user_input, std::filesystem::path 
resolved_path = std::filesystem::weakly_canonical( std::filesystem::path(basedir) / std::filesystem::path(user_input)); /* Ensure the resolved path is within our basedir */ - for (auto p = resolved_path; !p.empty(); p = p.parent_path()) { - if (std::filesystem::equivalent(p, abs_base)) { - return resolved_path; - } - if (p == p.parent_path()) { // reached the root directory - break; - } + if (resolved_path.string().find(abs_base.string()) != std::string::npos) { + return resolved_path; } + return {}; } diff --git a/engine/services/engine_service.cc b/engine/services/engine_service.cc index 48cc6ff37..89cd00058 100644 --- a/engine/services/engine_service.cc +++ b/engine/services/engine_service.cc @@ -9,6 +9,7 @@ #include "config/model_config.h" #include "database/engines.h" #include "database/models.h" +#include "extensions/local-engine/local_engine.h" #include "extensions/remote-engine/remote_engine.h" #include "utils/archive_utils.h" @@ -16,6 +17,7 @@ #include "utils/engine_matcher_utils.h" #include "utils/file_manager_utils.h" #include "utils/github_release_utils.h" +#include "utils/hardware/os_info.h" #include "utils/logging_utils.h" #include "utils/normalize_engine.h" #include "utils/result.hpp" @@ -46,13 +48,6 @@ std::string Repo2Engine(const std::string& r) { } return r; }; - -std::string GetEnginePath(std::string_view e) { - if (e == kLlamaRepo) { - return kLlamaLibPath; - } - return kLlamaLibPath; -}; } // namespace cpp::result EngineService::InstallEngineAsync( @@ -236,11 +231,14 @@ cpp::result EngineService::DownloadEngine( auto latest_version_semantic = normalized_version == "latest" ? res.value()[0].version : normalized_version; - auto merged_variant_name = engine + "-" + latest_version_semantic + "-" + - variant_name.value() + ".tar.gz"; + std::unordered_set merged_variant_name = { + "llama-" + latest_version_semantic + "-bin-" + variant_name.value() + + ".tar.gz", // menlo + "llama-" + latest_version_semantic + "-bin-" + variant_name.value() + + ".zip"}; // ggml for (const auto& asset : res.value()) { - if (asset.name == merged_variant_name) { + if (merged_variant_name.find(asset.name) != merged_variant_name.end()) { selected_variant = asset; break; } @@ -275,43 +273,96 @@ cpp::result EngineService::DownloadEngine( } } - auto normalize_version = "v" + selected_variant->version; auto variant_folder_name = engine_matcher_utils::GetVariantFromNameAndVersion( selected_variant->name, engine, selected_variant->version); auto variant_folder_path = file_manager_utils::GetEnginesContainerPath() / engine / variant_folder_name.value() / - normalize_version; + selected_variant->version; auto variant_path = variant_folder_path / selected_variant->name; std::filesystem::create_directories(variant_folder_path); CTL_INF("variant_folder_path: " + variant_folder_path.string()); - auto on_finished = [this, engine, selected_variant, variant_folder_path, - normalize_version](const DownloadTask& finishedTask) { + auto on_finished = [this, engine, selected_variant, + variant_folder_path](const DownloadTask& finishedTask) { // try to unzip the downloaded file CTL_INF("Engine zip path: " << finishedTask.items[0].localPath.string()); - CTL_INF("Version: " + normalize_version); + CTL_INF("Version: " + selected_variant->version); auto extract_path = finishedTask.items[0].localPath.parent_path(); archive_utils::ExtractArchive(finishedTask.items[0].localPath.string(), extract_path.string(), true); - + CTL_INF("local path: " << finishedTask.items[0].localPath.string() + << ", extract path: " << 
extract_path.string()); auto variant = engine_matcher_utils::GetVariantFromNameAndVersion( - selected_variant->name, engine, normalize_version); - + selected_variant->name, engine, selected_variant->version); CTL_INF("Extracted variant: " + variant.value()); - // set as default + try { + // Create version file + std::ofstream meta(extract_path / "version.txt", std::ios::out); + meta << "name: " << variant.value() << std::endl; + meta << "version: " << selected_variant->version << std::endl; + meta.close(); + + std::filesystem::path bin_path = extract_path / "build" / "bin"; + if (std::filesystem::exists(bin_path)) { + for (const auto& entry : + std::filesystem::directory_iterator(bin_path)) { + if (entry.is_regular_file()) { + std::filesystem::path target_file = + extract_path / entry.path().filename(); + std::filesystem::copy_file( + entry.path(), target_file, + std::filesystem::copy_options::overwrite_existing); + } + } + std::filesystem::remove_all(bin_path.parent_path()); + } + if (!std::filesystem::exists(extract_path.parent_path().parent_path() / + "deps")) { + std::filesystem::create_directory( + extract_path.parent_path().parent_path() / "deps"); + } + std::filesystem::permissions(extract_path / kLlamaServer, + std::filesystem::perms::owner_exec | + std::filesystem::perms::group_exec | + std::filesystem::perms::others_exec, + std::filesystem::perm_options::add); + + const std::vector windows_deps = { + "msvcp140.dll", "vcruntime140.dll", "vcruntime140_1.dll"}; + for (auto const& win_dep : windows_deps) { + if (std::filesystem::exists( + file_manager_utils::GetExecutableFolderContainerPath() / + win_dep)) { + CTL_INF("Copy file " + << (file_manager_utils::GetExecutableFolderContainerPath() / + win_dep) + .string() + << " to " << extract_path.string()); + std::filesystem::copy_file( + file_manager_utils::GetExecutableFolderContainerPath() / win_dep, + extract_path / win_dep, + std::filesystem::copy_options::overwrite_existing); + } + } + + } catch (const std::exception& e) { + CTL_INF(e.what()); + } - auto res = - SetDefaultEngineVariant(engine, normalize_version, variant.value()); + // set as default + auto res = SetDefaultEngineVariant(engine, selected_variant->version, + variant.value()); if (res.has_error()) { CTL_ERR("Failed to set default engine variant: " << res.error()); } else { CTL_INF("Set default engine variant: " << res.value().variant); } - auto create_res = EngineService::UpsertEngine( - engine, // engine_name - kLocal, "", "", normalize_version, variant.value(), "Default", ""); + auto create_res = + EngineService::UpsertEngine(engine, // engine_name + kLocal, "", "", selected_variant->version, + variant.value(), "Default", ""); if (create_res.has_error()) { CTL_ERR("Failed to create engine entry: " << create_res->engine_name); @@ -322,7 +373,7 @@ cpp::result EngineService::DownloadEngine( for (const auto& entry : std::filesystem::directory_iterator( variant_folder_path.parent_path())) { if (entry.is_directory() && - entry.path().filename() != normalize_version) { + entry.path().filename() != selected_variant->version) { try { std::filesystem::remove_all(entry.path()); } catch (const std::exception& e) { @@ -450,7 +501,26 @@ std::string EngineService::GetMatchedVariant( cpp::result, std::string> EngineService::GetEngineReleases(const std::string& engine) const { auto ne = cortex::engine::NormalizeEngine(engine); - return github_release_utils::GetReleases("menloresearch", ne); + auto ggml_org = github_release_utils::GetReleases(kGgmlOrg, ne); + auto menlo = 
github_release_utils::GetReleases(kMenloOrg, ne); + if (ggml_org.has_error() && menlo.has_error()) { + return cpp::fail(ggml_org.error()); + } + auto comparator = [](const EngineService::EngineRelease& e1, + const EngineService::EngineRelease& e2) { + return e1.name > e2.name; + }; + std::set s(comparator); + if (ggml_org.has_value()) { + s.insert(ggml_org.value().begin(), ggml_org.value().end()); + } + + if (menlo.has_value()) { + s.insert(menlo.value().begin(), menlo.value().end()); + } + std::vector res; + std::copy(s.begin(), s.end(), std::back_inserter(res)); + return res; } cpp::result, std::string> @@ -458,16 +528,85 @@ EngineService::GetEngineVariants(const std::string& engine, const std::string& version, bool filter_compatible_only) const { auto ne = cortex::engine::NormalizeEngine(engine); - auto engine_release = - github_release_utils::GetReleaseByVersion("menloresearch", ne, version); + auto engine_release_menlo = + github_release_utils::GetReleaseByVersion(kMenloOrg, ne, version); + auto engine_release_ggml = + github_release_utils::GetReleaseByVersion(kGgmlOrg, ne, version); + + if (engine_release_menlo.has_error() && engine_release_ggml.has_error()) { + return cpp::fail("Failed to get engine release: " + + engine_release_menlo.error()); + } + if (engine_release_menlo.has_error()) { + CTL_WRN("Failed to get engine release: " << engine_release_menlo.error()); + } - if (engine_release.has_error()) { - return cpp::fail("Failed to get engine release: " + engine_release.error()); + if (engine_release_ggml.has_error()) { + CTL_WRN("Failed to get engine release: " << engine_release_ggml.error()); } std::vector compatible_variants; - for (const auto& variant : engine_release.value().assets) { - if (variant.content_type != "application/gzip") { + std::vector assets; + + auto get_os_major = []() -> int { + auto os_info = cortex::hw::GetOSInfo(); + // Get os major version + size_t dot_pos = os_info.version.find_first_of("."); + if (dot_pos != std::string::npos) { + try { + return std::stoi(os_info.version.substr(0, dot_pos)); + } catch (const std::exception& e) { + return 0; + } + } else { + // No version found + return 0; + } + }; + + if (engine_release_menlo.has_value()) { + // In case of macos, if os version is 12, we get binary from menlo + std::copy_if( + engine_release_menlo.value().assets.begin(), + engine_release_menlo.value().assets.end(), std::back_inserter(assets), + [get_os_major](const github_release_utils::GitHubAsset& assets) { +#if defined(__APPLE__) && defined(__MACH__) + if ((assets.name.find(kMacOs) == std::string::npos) || + (get_os_major() <= 12 && + assets.name.find(kMacOs) != std::string::npos)) { + return true; + } + return false; +#else + return true; +#endif + }); + } + + if (engine_release_ggml.has_value()) { + // In case of macos, if os version is 12, we get binary from menlo + std::copy_if( + engine_release_ggml.value().assets.begin(), + engine_release_ggml.value().assets.end(), std::back_inserter(assets), + [get_os_major](const github_release_utils::GitHubAsset& assets) { +#if defined(__APPLE__) && defined(__MACH__) + if ((assets.name.find(kMacOs) == std::string::npos) || + (get_os_major() > 12 && + assets.name.find(kMacOs) != std::string::npos)) { + return true; + } + return false; +#else + return true; +#endif + }); + } + + for (const auto& variant : assets) { + CTL_INF("content_type: " << variant.content_type + << ", name: " << variant.name); + if (variant.content_type != "application/gzip" && + variant.content_type != "application/json; charset=utf-8") 
{ continue; } if (variant.state != "uploaded") { @@ -494,30 +633,29 @@ EngineService::GetEngineVariants(const std::string& engine, name.find("mac") != std::string::npos) os_match = true; if (system_info->os == "windows" && - name.find("windows") != std::string::npos) + name.find("win") != std::string::npos) os_match = true; if (system_info->os == "linux" && - name.find("linux") != std::string::npos) + (name.find("linux") != std::string::npos || + name.find("ubuntu") != std::string::npos)) os_match = true; bool arch_match = false; if (system_info->arch == "arm64" && name.find("arm64") != std::string::npos) arch_match = true; - if (system_info->arch == "amd64" && - name.find("amd64") != std::string::npos) + if (system_info->arch == "x64" && + name.find("x64") != std::string::npos) arch_match = true; return !(os_match && arch_match); }), compatible_variants.end()); - if (compatible_variants.empty()) { return cpp::fail("No compatible variants found for system " + system_info->os + "/" + system_info->arch); } } - return compatible_variants; } @@ -550,7 +688,7 @@ EngineService::SetDefaultEngineVariant(const std::string& engine, auto normalized_version = string_utils::RemoveSubstring(version, "v"); auto config = file_manager_utils::GetCortexConfig(); - config.llamacppVersion = "v" + normalized_version; + config.llamacppVersion = normalized_version; config.llamacppVariant = variant; auto result = file_manager_utils::UpdateCortexConfig(config); if (result.has_error()) { @@ -574,10 +712,10 @@ cpp::result EngineService::IsEngineVariantReady( return cpp::fail(installed_engines.error()); } - CLI_LOG("IsEngineVariantReady: " << ne << ", " << normalized_version << ", " + CTL_INF("IsEngineVariantReady: " << ne << ", " << normalized_version << ", " << variant); for (const auto& installed_engine : installed_engines.value()) { - CLI_LOG("Installed: name: " + installed_engine.name + + CTL_INF("Installed: name: " + installed_engine.name + ", version: " + installed_engine.version); if ((installed_engine.name == variant && installed_engine.version == normalized_version) || @@ -640,10 +778,10 @@ EngineService::GetInstalledEngineVariants(const std::string& engine) const { try { auto node = YAML::LoadFile(version_txt_path.string()); auto ev = EngineVariantResponse{ - node["name"].as(), // name - "v" + node["version"].as(), // version - engine, // engine - "", // type + node["name"].as(), // name + node["version"].as(), // version + engine, // engine + "", // type }; variants.push_back(ev); } catch (const YAML::Exception& e) { @@ -696,76 +834,18 @@ cpp::result EngineService::LoadEngine( } return {}; } - - // End hard code - - CTL_INF("Loading engine: " << ne); + if (engines_.find(ne) == engines_.end()) { + CTL_INF("Loading local engine: " << engine_name); #if defined(_WIN32) || defined(_WIN64) || defined(__linux__) - CTL_INF("CPU Info: " << cortex::cpuid::CpuInfo().to_string()); + CTL_INF("CPU Info: " << cortex::cpuid::CpuInfo().to_string()); #endif - - auto engine_dir_path_res = GetEngineDirPath(ne); - if (engine_dir_path_res.has_error()) { - return cpp::fail(engine_dir_path_res.error()); + engines_[ne].engine = new cortex::local::LocalEngine(*this, *(q_.get())); + CTL_INF("Loaded engine: " << engine_name); + } else { + CTL_INF("Engine has already been loaded: " << engine_name); } - auto engine_dir_path = engine_dir_path_res.value().first; - auto custom_engine_path = engine_dir_path_res.value().second; - - try { - auto cuda_path = file_manager_utils::GetCudaToolkitPath(ne); - -#if defined(_WIN32) || defined(_WIN64) 
- // register deps - if (!(getenv("ENGINE_PATH"))) { - std::vector paths{}; - paths.push_back(cuda_path); - paths.push_back(engine_dir_path); - - CTL_DBG("Registering dylib for " - << ne << " with " << std::to_string(paths.size()) << " paths."); - for (const auto& path : paths) { - CTL_DBG("Registering path: " << path.string()); - } - - auto reg_result = dylib_path_manager_->RegisterPath(ne, paths); - if (reg_result.has_error()) { - CTL_DBG("Failed register lib paths for: " << ne); - } else { - CTL_DBG("Registered lib paths for: " << ne); - } - } -#endif - auto dylib = - std::make_unique(engine_dir_path.string(), "engine"); - - auto config = file_manager_utils::GetCortexConfig(); - auto log_path = std::filesystem::path(config.logFolderPath) / - std::filesystem::path(config.logLlamaCppPath); - - // init - auto func = dylib->get_function("get_engine"); - auto engine_obj = func(); - auto load_opts = EngineI::EngineLoadOption{ - /* .engine_path = */ engine_dir_path, - /* .deps_path = */ cuda_path, - /* .is_custom_engine_path = */ custom_engine_path, - /* .log_path = */ log_path, - /* .max_log_lines = */ config.maxLogLines, - /* .log_level = */ logging_utils_helper::global_log_level, - }; - engine_obj->Load(load_opts); - - engines_[ne].engine = engine_obj; - engines_[ne].dl = std::move(dylib); - - CTL_DBG("Engine loaded: " << ne); - return {}; - } catch (const cortex_cpp::dylib::load_error& e) { - CTL_ERR("Could not load engine: " << e.what()); - engines_.erase(ne); - return cpp::fail("Could not load engine " + ne + ": " + e.what()); - } + return {}; } void EngineService::RegisterEngineLibPath() { @@ -796,7 +876,8 @@ void EngineService::RegisterEngineLibPath() { auto reg_result = dylib_path_manager_->RegisterPath(ne, paths); if (reg_result.has_error()) { - CTL_WRN("Failed register lib path for " << engine); + CTL_WRN("Failed register lib path for " + << engine << ", error: " << reg_result.error()); } else { CTL_DBG("Registered lib path for " << engine); } @@ -829,8 +910,8 @@ EngineService::GetEngineDirPath(const std::string& engine_name) { CTL_DBG("user defined engine path: " << user_defined_engine_path); const std::filesystem::path engine_dir_path = [&] { if (user_defined_engine_path != nullptr) { - return std::filesystem::path(user_defined_engine_path) / - GetEnginePath(ne) / selected_engine_variant->variant / + return std::filesystem::path(user_defined_engine_path) / kLlamaLibPath / + selected_engine_variant->variant / selected_engine_variant->version; } else { return file_manager_utils::GetEnginesContainerPath() / ne / @@ -891,8 +972,7 @@ std::vector EngineService::GetLoadedEngines() { cpp::result EngineService::GetLatestEngineVersion(const std::string& engine) const { auto ne = cortex::engine::NormalizeEngine(engine); - auto res = - github_release_utils::GetReleaseByVersion("menloresearch", ne, "latest"); + auto res = github_release_utils::GetReleaseByVersion(kMenloOrg, ne, "latest"); if (res.has_error()) { return cpp::fail("Failed to fetch engine " + engine + " latest version!"); } diff --git a/engine/services/engine_service.h b/engine/services/engine_service.h index 7e6be74c5..0be1fff64 100644 --- a/engine/services/engine_service.h +++ b/engine/services/engine_service.h @@ -19,6 +19,7 @@ #include "utils/github_release_utils.h" #include "utils/result.hpp" #include "utils/system_info_utils.h" +#include "utils/task_queue.h" struct EngineUpdateResult { std::string engine; @@ -44,7 +45,6 @@ class EngineService : public EngineServiceI { using EngineVariant = github_release_utils::GitHubAsset; 
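  // [Editor's note] Illustrative wiring sketch, not part of the patch; it
  // mirrors the main.cc change earlier in this diff and assumes the
  // cortex::TaskQueue type from utils/task_queue.h:
  //
  //   auto task_queue = std::make_shared<cortex::TaskQueue>(
  //       std::min(2u, std::thread::hardware_concurrency()),
  //       "background_task");
  //   auto engine_service = std::make_shared<EngineService>(
  //       download_service, dylib_path_manager, db_service, task_queue);
  //
  //   // LoadEngine() then constructs the in-process engine directly:
  //   //   engines_[ne].engine = new cortex::local::LocalEngine(*this, *task_queue);
  //   // instead of dlopen-ing a cortex.llamacpp dynamic library as before,
  //   // which is why EngineInfo below no longer holds a dylib handle.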
struct EngineInfo { - std::unique_ptr dl; EngineV engine; }; @@ -60,12 +60,13 @@ class EngineService : public EngineServiceI { }; HardwareInfo hw_inf_; std::shared_ptr db_service_ = nullptr; + std::shared_ptr q_ = nullptr; public: - explicit EngineService( - std::shared_ptr download_service, - std::shared_ptr dylib_path_manager, - std::shared_ptr db_service) + EngineService(std::shared_ptr download_service, + std::shared_ptr dylib_path_manager, + std::shared_ptr db_service, + std::shared_ptr q) : download_service_{download_service}, dylib_path_manager_{dylib_path_manager}, hw_inf_{ @@ -74,9 +75,17 @@ class EngineService : public EngineServiceI { system_info_utils::GetDriverAndCudaVersion() .second // cuda_driver_version. }, + db_service_(db_service), + q_(q) {} - db_service_(db_service) {} - + EngineService(std::shared_ptr dylib_path_manager) + : dylib_path_manager_(dylib_path_manager), + hw_inf_{ + system_info_utils::GetSystemInfo(), // sys_inf. + {}, // cpu_info. + system_info_utils::GetDriverAndCudaVersion() + .second // cuda_driver_version. + } {} std::vector GetEngineInfoList() const; /** @@ -159,6 +168,9 @@ class EngineService : public EngineServiceI { bool IsRemoteEngine(const std::string& engine_name) const override; + cpp::result, std::string> + GetEngineDirPath(const std::string& engine_name); + private: bool IsEngineLoaded(const std::string& engine); @@ -172,9 +184,6 @@ class EngineService : public EngineServiceI { std::string GetMatchedVariant(const std::string& engine, const std::vector& variants); - cpp::result, std::string> - GetEngineDirPath(const std::string& engine_name); - cpp::result IsEngineVariantReady( const std::string& engine, const std::string& version, const std::string& variant); diff --git a/engine/services/hardware_service.cc b/engine/services/hardware_service.cc index f0ccadb28..fb2f841be 100644 --- a/engine/services/hardware_service.cc +++ b/engine/services/hardware_service.cc @@ -203,10 +203,8 @@ bool HardwareService::Restart(const std::string& host, int port) { #else std::vector commands; // Some engines requires to add lib search path before process being created - auto download_srv = std::make_shared(); - auto dylib_path_mng = std::make_shared(); - auto db_srv = std::make_shared(); - EngineService(download_srv, dylib_path_mng, db_srv).RegisterEngineLibPath(); + EngineService(std::make_shared()) + .RegisterEngineLibPath(); std::string p = cortex_utils::GetCurrentPath() / exe; commands.push_back(p); commands.push_back("--ignore_cout"); diff --git a/engine/services/inference_service.cc b/engine/services/inference_service.cc index a1646495b..75d95f06d 100644 --- a/engine/services/inference_service.cc +++ b/engine/services/inference_service.cc @@ -12,7 +12,9 @@ cpp::result InferenceService::HandleChatCompletion( } else { engine_type = (*(json_body)).get("engine", kLlamaRepo).asString(); } + CTL_DBG("engine_type: " << engine_type); function_calling_utils::PreprocessRequest(json_body); + CTL_DBG("engine_type: " << engine_type); auto tool_choice = json_body->get("tool_choice", Json::Value::null); auto model_id = json_body->get("model", "").asString(); if (saved_models_.find(model_id) != saved_models_.end()) { @@ -32,6 +34,7 @@ cpp::result InferenceService::HandleChatCompletion( } } } + CTL_DBG("engine_type: " << engine_type); auto engine_result = engine_service_->GetLoadedEngine(engine_type); if (engine_result.has_error()) { @@ -275,9 +278,7 @@ InferResult InferenceService::GetModels( for (const auto& loaded_engine : loaded_engines) { if 
(std::holds_alternative(loaded_engine)) { auto e = std::get(loaded_engine); - if (e->IsSupported("GetModels")) { - e->GetModels(json_body, std::move(cb)); - } + e->GetModels(json_body, std::move(cb)); } else { std::get(loaded_engine) ->GetModels(json_body, std::move(cb)); @@ -302,10 +303,8 @@ bool InferenceService::StopInferencing(const std::string& engine_name, if (std::holds_alternative(engine_result.value())) { auto engine = std::get(engine_result.value()); - if (engine->IsSupported("StopInferencing")) { - engine->StopInferencing(model_id); - CTL_INF("Stopped inferencing"); - } + engine->StopInferencing(model_id); + CTL_INF("Stopped inferencing"); } return true; } diff --git a/engine/test/components/test_engine_matcher_utils.cc b/engine/test/components/test_engine_matcher_utils.cc index 1d1ed47a8..2c24a9b6f 100644 --- a/engine/test/components/test_engine_matcher_utils.cc +++ b/engine/test/components/test_engine_matcher_utils.cc @@ -6,125 +6,78 @@ class EngineMatcherUtilsTestSuite : public ::testing::Test { protected: const std::vector cortex_llamacpp_variants{ - "cortex.llamacpp-0.1.25-25.08.24-linux-amd64-avx-cuda-11-7.tar.gz", - "cortex.llamacpp-0.1.25-25.08.24-linux-amd64-avx-cuda-12-0.tar.gz", - "cortex.llamacpp-0.1.25-25.08.24-linux-amd64-avx.tar.gz", - "cortex.llamacpp-0.1.25-25.08.24-linux-amd64-avx2-cuda-11-7.tar.gz", - "cortex.llamacpp-0.1.25-25.08.24-linux-amd64-avx2-cuda-12-0.tar.gz", - "cortex.llamacpp-0.1.25-25.08.24-linux-amd64-avx2.tar.gz", - "cortex.llamacpp-0.1.25-25.08.24-linux-amd64-avx512-cuda-11-7.tar.gz", - "cortex.llamacpp-0.1.25-25.08.24-linux-amd64-avx512-cuda-12-0.tar.gz", - "cortex.llamacpp-0.1.25-25.08.24-linux-amd64-avx512.tar.gz", - "cortex.llamacpp-0.1.25-25.08.24-linux-amd64-noavx-cuda-11-7.tar.gz", - "cortex.llamacpp-0.1.25-25.08.24-linux-amd64-noavx-cuda-12-0.tar.gz", - "cortex.llamacpp-0.1.25-25.08.24-linux-amd64-noavx.tar.gz", - "cortex.llamacpp-0.1.25-25.08.24-linux-amd64-vulkan.tar.gz", - "cortex.llamacpp-0.1.43-linux-arm64.tar.gz", - "cortex.llamacpp-0.1.25-25.08.24-mac-amd64.tar.gz", - "cortex.llamacpp-0.1.25-25.08.24-mac-arm64.tar.gz", - "cortex.llamacpp-0.1.25-25.08.24-windows-amd64-avx-cuda-11-7.tar.gz", - "cortex.llamacpp-0.1.25-25.08.24-windows-amd64-avx-cuda-12-0.tar.gz", - "cortex.llamacpp-0.1.25-25.08.24-windows-amd64-avx.tar.gz", - "cortex.llamacpp-0.1.25-25.08.24-windows-amd64-avx2-cuda-11-7.tar.gz", - "cortex.llamacpp-0.1.25-25.08.24-windows-amd64-avx2-cuda-12-0.tar.gz", - "cortex.llamacpp-0.1.25-25.08.24-windows-amd64-avx2.tar.gz", - "cortex.llamacpp-0.1.25-25.08.24-windows-amd64-avx512-cuda-11-7.tar.gz", - "cortex.llamacpp-0.1.25-25.08.24-windows-amd64-avx512-cuda-12-0.tar.gz", - "cortex.llamacpp-0.1.25-25.08.24-windows-amd64-avx512.tar.gz", - "cortex.llamacpp-0.1.25-25.08.24-windows-amd64-noavx-cuda-11-7.tar.gz", - "cortex.llamacpp-0.1.25-25.08.24-windows-amd64-noavx-cuda-12-0.tar.gz", - "cortex.llamacpp-0.1.25-25.08.24-windows-amd64-noavx.tar.gz", - "cortex.llamacpp-0.1.25-25.08.24-windows-amd64-vulkan.tar.gz", + "llama-b4920-bin-ubuntu-arm64.zip", + "llama-b4920-bin-linux-avx-cuda-cu11.7-x64.tar.gz", + "llama-b4920-bin-linux-avx-cuda-cu12.0-x64.tar.gz", + "llama-b4920-bin-linux-avx-x64.tar.gz", + "llama-b4920-bin-linux-avx2-cuda-cu11.7-x64.tar.gz", + "llama-b4920-bin-linux-avx2-cuda-cu12.0-x64.tar.gz", + "llama-b4920-bin-ubuntu-x64.tar.gz", + "llama-b4920-bin-linux-avx512-cuda-cu11.7-x64.tar.gz", + "llama-b4920-bin-linux-avx512-cuda-cu12.0-x64.tar.gz", + "llama-b4920-bin-linux-avx512-x64.tar.gz", + 
"llama-b4920-bin-linux-noavx-cuda-cu11.7-x64.tar.gz", + "llama-b4920-bin-linux-noavx-cuda-cu12.0-x64.tar.gz", + "llama-b4920-bin-linux-noavx-x64.tar.gz", + "llama-b4920-bin-ubuntu-vulkan-x64.tar.gz", + "llama-b4920-bin-macos-arm64.zip", + "llama-b4920-bin-macos-x64.zip", + "llama-b4920-bin-win-avx-cuda-cu11.7-x64.tar.gz", + "llama-b4920-bin-win-avx-cuda-cu12.0-x64.tar.gz", + "llama-b4920-bin-win-avx-x64.zip", + "llama-b4920-bin-win-avx2-cuda-cu11.7-x64.tar.gz", + "llama-b4920-bin-win-avx2-cuda-cu12.0-x64.tar.gz", + "llama-b4920-bin-win-avx2-x64.zip", + "llama-b4920-bin-win-avx512-cuda-cu11.7-x64.tar.gz", + "llama-b4920-bin-win-avx512-cuda-cu12.0-x64.tar.gz", + "llama-b4920-bin-win-avx512-x64.zip", + "llama-b4920-bin-win-noavx-cuda-cu11.7-x64.tar.gz", + "llama-b4920-bin-win-noavx-cuda-cu12.0-x64.tar.gz", + "llama-b4920-bin-win-noavx-x64.zip", + "llama-b4920-bin-win-vulkan-x64.zip", }; - - const std::vector cortex_tensorrt_variants{ - "cortex.tensorrt-llm-0.0.9-linux-cuda-12-4.tar.gz", - "cortex.tensorrt-llm-0.0.9-windows-cuda-12-4.tar.gz"}; - - const std::vector cortex_onnx_variants{ - "cortex.onnx-0.1.7-windows-amd64.tar.gz"}; }; -TEST_F(EngineMatcherUtilsTestSuite, TestValidateOnnx) { - - { - auto expect_matched_variant = cortex_onnx_variants[0]; - auto result = engine_matcher_utils::ValidateOnnx(cortex_onnx_variants, - "windows", "amd64"); - - EXPECT_EQ(result, expect_matched_variant); - } - - { - // should return an empty variant because no variant matched - auto expect_matched_variant{""}; - auto windows_arm_result = engine_matcher_utils::ValidateOnnx( - cortex_onnx_variants, "windows", "arm"); - auto mac_arm64_result = engine_matcher_utils::ValidateOnnx( - cortex_onnx_variants, "mac", "arm64"); - - EXPECT_EQ(windows_arm_result, expect_matched_variant); - EXPECT_EQ(mac_arm64_result, expect_matched_variant); - } -} - -TEST_F(EngineMatcherUtilsTestSuite, TestValidateTensorrt) { - +TEST_F(EngineMatcherUtilsTestSuite, TestValidate) { { - auto windows_expect_matched_variant{cortex_tensorrt_variants[1]}; - auto linux_expect_matched_variant{cortex_tensorrt_variants[0]}; - auto windows{"windows"}; - auto linux{"linux"}; + auto os{"win"}; + auto cpu_arch{"x64"}; + auto suitable_avx{"avx2"}; auto cuda_version{"12.4"}; - auto windows_result = engine_matcher_utils::ValidateTensorrtLlm( - cortex_tensorrt_variants, windows, cuda_version); - auto linux_result = engine_matcher_utils::ValidateTensorrtLlm( - cortex_tensorrt_variants, linux, cuda_version); - EXPECT_EQ(windows_result, windows_expect_matched_variant); - EXPECT_EQ(linux_result, linux_expect_matched_variant); - } - - { // macos is not supported - auto os = "mac"; - auto cuda_version{"12.4"}; + auto variant = engine_matcher_utils::Validate( + cortex_llamacpp_variants, os, cpu_arch, suitable_avx, cuda_version); - auto result = engine_matcher_utils::ValidateTensorrtLlm( - cortex_tensorrt_variants, os, cuda_version); - EXPECT_EQ(result, ""); + EXPECT_EQ(variant, "llama-b4920-bin-win-avx2-cuda-cu12.0-x64.tar.gz"); } -} -TEST_F(EngineMatcherUtilsTestSuite, TestValidate) { { - auto os{"windows"}; - auto cpu_arch{"amd64"}; - auto suitable_avx{"avx2"}; - auto cuda_version{"12.4"}; + auto os{"mac"}; + auto cpu_arch{"x64"}; + auto suitable_avx{""}; + auto cuda_version{""}; auto variant = engine_matcher_utils::Validate( cortex_llamacpp_variants, os, cpu_arch, suitable_avx, cuda_version); - EXPECT_EQ( - variant, - "cortex.llamacpp-0.1.25-25.08.24-windows-amd64-avx2-cuda-12-0.tar.gz"); + EXPECT_EQ(variant, "llama-b4920-bin-macos-x64.zip"); } { auto 
os{"mac"}; - auto cpu_arch{"amd64"}; + auto cpu_arch{"arm64"}; auto suitable_avx{""}; auto cuda_version{""}; auto variant = engine_matcher_utils::Validate( cortex_llamacpp_variants, os, cpu_arch, suitable_avx, cuda_version); - EXPECT_EQ(variant, "cortex.llamacpp-0.1.25-25.08.24-mac-amd64.tar.gz"); + EXPECT_EQ(variant, "llama-b4920-bin-macos-arm64.zip"); } { - auto os{"windows"}; - auto cpu_arch{"amd64"}; + auto os{"win"}; + auto cpu_arch{"x64"}; auto suitable_avx{"avx2"}; auto cuda_version{"10"}; @@ -132,8 +85,7 @@ TEST_F(EngineMatcherUtilsTestSuite, TestValidate) { cortex_llamacpp_variants, os, cpu_arch, suitable_avx, cuda_version); // fallback to no cuda version - EXPECT_EQ(variant, - "cortex.llamacpp-0.1.25-25.08.24-windows-amd64-avx2.tar.gz"); + EXPECT_EQ(variant, "llama-b4920-bin-win-avx2-x64.zip"); } { @@ -145,30 +97,43 @@ TEST_F(EngineMatcherUtilsTestSuite, TestValidate) { auto variant = engine_matcher_utils::Validate( cortex_llamacpp_variants, os, cpu_arch, suitable_avx, cuda_version); - EXPECT_EQ(variant, "cortex.llamacpp-0.1.43-linux-arm64.tar.gz"); + EXPECT_EQ(variant, "llama-b4920-bin-ubuntu-arm64.zip"); } } TEST_F(EngineMatcherUtilsTestSuite, TestGetVersionAndArch) { { - std::string variant = - "cortex.llamacpp-0.1.25-25.08.24-linux-amd64-avx-cuda-11-7.tar.gz"; + std::string variant = "llama-b4920-bin-linux-avx-cuda-cu11.7-x64.tar.gz"; + auto [version, arch] = engine_matcher_utils::GetVersionAndArch(variant); + EXPECT_EQ(version, "b4920"); + EXPECT_EQ(arch, "linux-avx-cuda-cu11.7-x64"); + } + + { + std::string variant = "llama-b4920-bin-ubuntu-arm64.zip"; + auto [version, arch] = engine_matcher_utils::GetVersionAndArch(variant); + EXPECT_EQ(version, "b4920"); + EXPECT_EQ(arch, "ubuntu-arm64"); + } + + { + std::string variant = "llama-b4920-bin-win-avx2-x64.zip"; auto [version, arch] = engine_matcher_utils::GetVersionAndArch(variant); - EXPECT_EQ(version, "v0.1.25-25.08.24"); - EXPECT_EQ(arch, "linux-amd64-avx-cuda-11-7"); + EXPECT_EQ(version, "b4920"); + EXPECT_EQ(arch, "win-avx2-x64"); } { - std::string variant = "cortex.llamacpp-0.1.25-windows-amd64-avx2.tar.gz"; + std::string variant = "llama-b4920-bin-macos-x64.tar.gz"; auto [version, arch] = engine_matcher_utils::GetVersionAndArch(variant); - EXPECT_EQ(version, "v0.1.25"); - EXPECT_EQ(arch, "windows-amd64-avx2"); + EXPECT_EQ(version, "b4920"); + EXPECT_EQ(arch, "macos-x64"); } { - std::string variant = "cortex.llamacpp-0.1.25-25.08.24-mac-amd64.tar.gz"; + std::string variant = "llama-b4920-bin-ubuntu-vulkan-x64.zip"; auto [version, arch] = engine_matcher_utils::GetVersionAndArch(variant); - EXPECT_EQ(version, "v0.1.25-25.08.24"); - EXPECT_EQ(arch, "mac-amd64"); + EXPECT_EQ(version, "b4920"); + EXPECT_EQ(arch, "ubuntu-vulkan-x64"); } } diff --git a/engine/test/components/test_github_release_utils.cc b/engine/test/components/test_github_release_utils.cc index ae1e2c7c2..20c14b187 100644 --- a/engine/test/components/test_github_release_utils.cc +++ b/engine/test/components/test_github_release_utils.cc @@ -4,16 +4,16 @@ class GitHubReleaseUtilsTest : public ::testing::Test {}; TEST_F(GitHubReleaseUtilsTest, AbleToGetReleaseByVersion) { - auto version{"v0.1.36"}; + auto version{"b4920"}; auto result = github_release_utils::GetReleaseByVersion( - "menloresearch", "cortex.llamacpp", version); + kMenloOrg, "llama.cpp", version); ASSERT_TRUE(result.has_value()); ASSERT_EQ(result->tag_name, version); } TEST_F(GitHubReleaseUtilsTest, AbleToGetReleaseList) { - auto result = github_release_utils::GetReleases("menloresearch", 
"cortex.llamacpp"); + auto result = github_release_utils::GetReleases(kMenloOrg, "llama.cpp"); ASSERT_TRUE(result.has_value()); ASSERT_TRUE(result->size() > 0); diff --git a/engine/test/components/test_string_utils.cc b/engine/test/components/test_string_utils.cc index 42211b668..e12046136 100644 --- a/engine/test/components/test_string_utils.cc +++ b/engine/test/components/test_string_utils.cc @@ -288,6 +288,47 @@ TEST_F(StringUtilsTestSuite, LargeInputPerformance) { EXPECT_EQ(RemoveSubstring(large_input, to_remove), ""); } +TEST(LTrimTest, EmptyString) { + std::string s = ""; + LTrim(s); + EXPECT_EQ(s, ""); +} + +TEST(LTrimTest, NoSpaces) { + std::string s = "HelloWorld"; + LTrim(s); + EXPECT_EQ(s, "HelloWorld"); +} + +TEST(LTrimTest, LeadingSpaces) { + std::string s = " HelloWorld"; + LTrim(s); + EXPECT_EQ(s, "HelloWorld"); +} + +TEST(LTrimTest, LeadingTabs) { + std::string s = "\t\tHelloWorld"; + LTrim(s); + EXPECT_EQ(s, "HelloWorld"); +} + +TEST(LTrimTest, LeadingNewlines) { + std::string s = "\n\nHelloWorld"; + LTrim(s); + EXPECT_EQ(s, "HelloWorld"); +} + +TEST(LTrimTest, OnlySpaces) { + std::string s = " "; + LTrim(s); + EXPECT_EQ(s, ""); +} + +TEST(LTrimTest, MixedSpaces) { + std::string s = " \t\nHelloWorld "; + LTrim(s); + EXPECT_EQ(s, "HelloWorld "); +} TEST_F(StringUtilsTestSuite, UrlPaths_SimilarStrings) { std::string str1 = "/v1/threads/{1}/messages/{2}"; diff --git a/engine/utils/cuda_toolkit_utils.h b/engine/utils/cuda_toolkit_utils.h index 748af1bd3..e7aadfdd6 100644 --- a/engine/utils/cuda_toolkit_utils.h +++ b/engine/utils/cuda_toolkit_utils.h @@ -7,32 +7,7 @@ inline std::string GetCompatibleCudaToolkitVersion( const std::string& driver_semantic_version, const std::string& os, const std::string& engine) { - if (engine == "cortex.tensorrt-llm") { - // if the engine is cortex.tensorrt-llm, the minimum required CUDA version is 12.4 - if (os == "windows") { - if (semantic_version_utils::CompareSemanticVersion( - driver_semantic_version, "527.41") >= 0) { - return "12.4"; - } else { - throw std::runtime_error( - "GPU driver version not supported. Minimum " - "required driver version is 527.41"); - } - } else if (os == "linux") { - if (semantic_version_utils::CompareSemanticVersion( - driver_semantic_version, "525.60.13") >= 0) { - return "12.4"; - } else { - throw std::runtime_error( - "GPU driver version not supported. Minimum required driver version " - "is 525.60.13"); - } - } else { - throw std::runtime_error("Unsupported OS"); - } - } - - if (os == "windows") { + if (os == "windows" || os == "win") { if (semantic_version_utils::CompareSemanticVersion(driver_semantic_version, "527.41") >= 0) { return "12.4"; @@ -44,7 +19,7 @@ inline std::string GetCompatibleCudaToolkitVersion( "GPU driver version not supported. 
Minimum " "required driver version is 452.39"); } - } else if (os == "linux") { + } else if (os == "linux" || os == "ubuntu") { if (semantic_version_utils::CompareSemanticVersion(driver_semantic_version, "525.60.13") >= 0) { return "12.4"; diff --git a/engine/utils/dylib_path_manager.cc b/engine/utils/dylib_path_manager.cc index 7c389df06..878620185 100644 --- a/engine/utils/dylib_path_manager.cc +++ b/engine/utils/dylib_path_manager.cc @@ -26,7 +26,7 @@ cpp::result DylibPathManager::RegisterPath( } return cpp::fail("Failed to add DLL directory: " + path.string()); } else { - CTL_DBG("Added DLL directory: " << path.string()); + CTL_INF("Added DLL directory: " << path.string()); } dylib_paths.push_back({path, cookie}); diff --git a/engine/utils/engine_constants.h b/engine/utils/engine_constants.h index 2c5cd1be3..695afb4c5 100644 --- a/engine/utils/engine_constants.h +++ b/engine/utils/engine_constants.h @@ -5,20 +5,23 @@ constexpr const auto kLlamaEngine = "llama-cpp"; constexpr const auto kRemote = "remote"; constexpr const auto kLocal = "local"; +constexpr const auto kLlamaRepo = "llama.cpp"; +constexpr const auto kLlamaLibPath = "./engines/llama.cpp"; +constexpr const auto kLlamaServer = "llama-server"; -constexpr const auto kLlamaRepo = "cortex.llamacpp"; - -constexpr const auto kLlamaLibPath = "./engines/cortex.llamacpp"; +constexpr const auto kMenloOrg = "menloresearch"; +constexpr const auto kGgmlOrg = "ggml-org"; // other constants constexpr auto static kHuggingFaceHost = "huggingface.co"; constexpr auto static kGitHubHost = "api.github.com"; constexpr auto static kCortexFolderName = "cortexcpp"; -constexpr auto static kDefaultGHUserAgent = "menloresearch"; +constexpr auto static kDefaultGHUserAgent = kMenloOrg; -constexpr auto static kWindowsOs = "windows"; +constexpr auto static kWindowsOs = "win"; constexpr auto static kMacOs = "mac"; constexpr auto static kLinuxOs = "linux"; +constexpr auto static kUbuntuOs = "ubuntu"; constexpr auto static kUnsupportedOs = "Unsupported OS"; constexpr auto static kCurlGetTimeout = 10; diff --git a/engine/utils/engine_matcher_utils.h b/engine/utils/engine_matcher_utils.h index 0b0cb26be..1afdd194c 100644 --- a/engine/utils/engine_matcher_utils.h +++ b/engine/utils/engine_matcher_utils.h @@ -7,6 +7,7 @@ #include #include #include "utils/cpuid/cpu_info.h" +#include "utils/engine_constants.h" #include "utils/logging_utils.h" #include "utils/result.hpp" #include "utils/string_utils.h" @@ -24,13 +25,19 @@ inline cpp::result GetVariantFromNameAndVersion( if (engine.empty()) { return cpp::fail("Engine name is empty"); } - auto nv = string_utils::RemoveSubstring(version, "v"); - using namespace string_utils; - auto removed_extension = RemoveSubstring(engine_file_name, ".tar.gz"); - auto version_and_variant = RemoveSubstring(removed_extension, engine + "-"); - - auto variant = RemoveSubstring(version_and_variant, nv + "-"); - return variant; + CTL_DBG("version: " << version); + namespace su = string_utils; + CTL_DBG("engine_file_name: " << engine_file_name); + auto rm_extension_menlo = su::RemoveSubstring(engine_file_name, ".tar.gz"); + auto rm_extension_ggml = su::RemoveSubstring(rm_extension_menlo, ".zip"); + CTL_DBG("removed_extension: " << rm_extension_ggml); + auto version_and_variant = + su::RemoveSubstring(rm_extension_ggml, engine + "-"); + CTL_DBG("version_and_variant: " << version_and_variant); + auto variant = su::RemoveSubstring(version_and_variant, version + "-"); + auto v = su::RemoveSubstring(variant, "llama-bin-"); + CTL_DBG("variant: " 
<< v); + return v; } inline std::string GetSuitableAvxVariant(cortex::cpuid::CpuInfo& cpu_info) { @@ -48,7 +55,7 @@ inline std::string GetSuitableCudaVariant( const std::vector<std::string>& variants, const std::string& cuda_version) { - std::regex cuda_reg("cuda-(\\d+)-(\\d+)"); + std::regex cuda_reg("cuda-cu(\\d+)\\.(\\d+)"); std::smatch match; int requested_major = 0; @@ -141,8 +148,9 @@ inline std::string Validate(const std::vector<std::string>& variants, const std::string& os, const std::string& cpu_arch, const std::string& suitable_avx, const std::string& cuda_version) { + // CTL_INF(os << " " << cpu_arch); // Early return if the OS is not supported - if (os != "mac" && os != "windows" && os != "linux") { + if (os != kMacOs && os != kWindowsOs && os != kLinuxOs) { return ""; } @@ -150,6 +158,12 @@ inline std::string Validate(const std::vector<std::string>& variants, std::copy_if(variants.begin(), variants.end(), std::back_inserter(os_and_arch_compatible_list), [&os, &cpu_arch](const std::string& variant) { + // In case of Linux, we need to include ubuntu version also + if (os == kLinuxOs) { + if (variant.find(kUbuntuOs) != std::string::npos && + variant.find(cpu_arch) != std::string::npos) + return true; + } auto os_match = "-" + os; auto cpu_arch_match = "-" + cpu_arch; @@ -157,10 +171,10 @@ inline std::string Validate(const std::vector<std::string>& variants, variant.find(cpu_arch_match) != std::string::npos; }); - if (os == "mac" && !os_and_arch_compatible_list.empty()) + if (os == kMacOs && !os_and_arch_compatible_list.empty()) return os_and_arch_compatible_list[0]; - if (os == "linux" && cpu_arch == "arm64" && + if (os == kLinuxOs && cpu_arch == "arm64" && !os_and_arch_compatible_list.empty()) { return os_and_arch_compatible_list[0]; } @@ -170,7 +184,14 @@ inline std::string Validate(const std::vector<std::string>& variants, std::copy_if(os_and_arch_compatible_list.begin(), os_and_arch_compatible_list.end(), std::back_inserter(avx_compatible_list), - [&suitable_avx](const std::string& variant) { + [&os, &cpu_arch, &suitable_avx](const std::string& variant) { + if (os == kLinuxOs && + (suitable_avx == "avx2" || suitable_avx == "avx512" || + cpu_arch == "arm64")) { + if (variant.find(std::string(kUbuntuOs) + "-" + cpu_arch) != + std::string::npos) + return true; + } auto suitable_avx_match = "-" + suitable_avx; return variant.find(suitable_avx_match) != std::string::npos; @@ -185,15 +206,18 @@ inline std::string Validate(const std::vector<std::string>& variants, inline std::pair<std::string, std::string> GetVersionAndArch( const std::string& file_name) { // Remove the file extension - std::string base = file_name.substr(0, file_name.find("tar") - 1); + std::string b = string_utils::RemoveSubstring(file_name, ".tar.gz"); + std::string base = string_utils::RemoveSubstring(b, ".zip"); size_t arch_pos = 0; - if (base.find("windows") != std::string::npos) { - arch_pos = base.find("-windows"); + if (base.find("win") != std::string::npos) { + arch_pos = base.find("-bin-win"); } else if (base.find("linux") != std::string::npos) { - arch_pos = base.find("-linux"); + arch_pos = base.find("-bin-linux"); + } else if (base.find("ubuntu") != std::string::npos) { + arch_pos = base.find("-bin-ubuntu"); } else { - arch_pos = base.find("-mac"); + arch_pos = base.find("-bin-macos"); } // Extract architecture part @@ -202,6 +226,6 @@ inline std::pair<std::string, std::string> GetVersionAndArch( // Extract version part size_t v_pos = base.find_first_of('-'); auto version = base.substr(v_pos + 1, arch_pos - v_pos - 1); - return std::pair("v" + version,
arch); + return std::pair(version, string_utils::RemoveSubstring(arch, "bin-")); } } // namespace engine_matcher_utils diff --git a/engine/utils/github_release_utils.h b/engine/utils/github_release_utils.h index 29f8a5725..84636903a 100644 --- a/engine/utils/github_release_utils.h +++ b/engine/utils/github_release_utils.h @@ -178,11 +178,6 @@ inline cpp::result GetReleaseByVersion( std::vector<std::string> path_params{"repos", author, repo, "releases"}; if (tag != "latest") { path_params.push_back("tags"); - - if (!string_utils::StartsWith(tag, "v")) { - path_params.push_back("v" + tag); - } - path_params.push_back(tag); } else { path_params.push_back("latest"); diff --git a/engine/utils/process/utils.cc b/engine/utils/process/utils.cc index f63de5c5e..c9ccddfdf 100644 --- a/engine/utils/process/utils.cc +++ b/engine/utils/process/utils.cc @@ -347,7 +347,7 @@ bool KillProcess(ProcessInfo& proc_info) { bool success; #if defined(_WIN32) - success = TerminateJobObject(proc_info.hJob, 0) == 0; + success = TerminateJobObject(proc_info.hJob, 0); #elif defined(__APPLE__) || defined(__linux__) // we send SIGTERM to subprocess. we trust that this subprocess will // propagate SIGTERM correctly to its children processes. diff --git a/engine/utils/string_utils.h b/engine/utils/string_utils.h index a9ea756b3..e1a567942 100644 --- a/engine/utils/string_utils.h +++ b/engine/utils/string_utils.h @@ -22,6 +22,12 @@ inline std::string RTrim(const std::string& str) { return (end == std::string::npos) ? "" : str.substr(0, end + 1); } +inline void LTrim(std::string& s) { + s.erase(s.begin(), std::find_if(s.begin(), s.end(), [](unsigned char ch) { + return !std::isspace(ch); + })); +} + inline void Trim(std::string& s) { s.erase(s.begin(), std::find_if(s.begin(), s.end(), [](unsigned char ch) { return !std::isspace(ch); diff --git a/engine/utils/system_info_utils.h b/engine/utils/system_info_utils.h index 54eaed8c9..9bef6f4f9 100644 --- a/engine/utils/system_info_utils.h +++ b/engine/utils/system_info_utils.h @@ -70,7 +70,7 @@ inline std::unique_ptr GetSystemInfo() { #if defined(__i386__) || defined(__x86_64__) || defined(__amd64__) || \ defined(__amd64) || defined(__x86_64) || defined(_M_AMD64) - arch << "amd64"; + arch << "x64"; #elif defined(__arm__) || defined(__arm) || defined(__arm64__) || \ defined(__aarch64__) || defined(__thumb__) || \ defined(__TARGET_ARCH_ARM) || defined(__TARGET_ARCH_THUMB) || \