diff --git a/.github/patches/windows/msvcp140.dll b/.github/patches/windows/msvcp140.dll index f999742d9..d3d103ee0 100644 Binary files a/.github/patches/windows/msvcp140.dll and b/.github/patches/windows/msvcp140.dll differ diff --git a/.github/patches/windows/vcruntime140.dll b/.github/patches/windows/vcruntime140.dll index 3a4aded20..8edab904f 100644 Binary files a/.github/patches/windows/vcruntime140.dll and b/.github/patches/windows/vcruntime140.dll differ diff --git a/.github/patches/windows/vcruntime140_1.dll b/.github/patches/windows/vcruntime140_1.dll index 3ebabdee6..2ef481dbf 100644 Binary files a/.github/patches/windows/vcruntime140_1.dll and b/.github/patches/windows/vcruntime140_1.dll differ diff --git a/.github/workflows/beta-build.yml b/.github/workflows/beta-build.yml index 1bf324d96..1d5480312 100644 --- a/.github/workflows/beta-build.yml +++ b/.github/workflows/beta-build.yml @@ -9,7 +9,7 @@ jobs: get-update-version: uses: ./.github/workflows/template-get-update-version.yml - get-cortex-llamacpp-latest-version: + get-llamacpp-latest-version: uses: ./.github/workflows/template-cortex-llamacpp-latest-version.yml create-draft-release: @@ -39,7 +39,7 @@ jobs: build-macos: uses: ./.github/workflows/template-build-macos.yml - needs: [get-update-version, create-draft-release, get-cortex-llamacpp-latest-version] + needs: [get-update-version, create-draft-release, get-llamacpp-latest-version] secrets: inherit with: ref: ${{ github.ref }} @@ -48,12 +48,12 @@ jobs: cmake-flags: "-DCORTEX_VARIANT=beta -DCORTEX_CPP_VERSION='v${{ needs.get-update-version.outputs.new_version }}' -DCMAKE_TOOLCHAIN_FILE=vcpkg/scripts/buildsystems/vcpkg.cmake" channel: beta upload_url: ${{ needs.create-draft-release.outputs.upload_url }} - cortex-llamacpp-version: ${{ needs.get-cortex-llamacpp-latest-version.outputs.cortex_llamacpp_latest_version }} + llamacpp-version: ${{ needs.get-llamacpp-latest-version.outputs.llamacpp_latest_version }} build-windows-x64: uses: ./.github/workflows/template-build-windows-x64.yml secrets: inherit - needs: [get-update-version, create-draft-release, get-cortex-llamacpp-latest-version] + needs: [get-update-version, create-draft-release, get-llamacpp-latest-version] with: ref: ${{ github.ref }} public_provider: github @@ -64,12 +64,12 @@ jobs: ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache' channel: beta upload_url: ${{ needs.create-draft-release.outputs.upload_url }} - cortex-llamacpp-version: ${{ needs.get-cortex-llamacpp-latest-version.outputs.cortex_llamacpp_latest_version }} + llamacpp-version: ${{ needs.get-llamacpp-latest-version.outputs.llamacpp_latest_version }} build-linux-x64: uses: ./.github/workflows/template-build-linux.yml secrets: inherit - needs: [get-update-version, create-draft-release, get-cortex-llamacpp-latest-version] + needs: [get-update-version, create-draft-release, get-llamacpp-latest-version] with: ref: ${{ github.ref }} public_provider: github @@ -78,13 +78,13 @@ jobs: cmake-flags: "-DCORTEX_VARIANT=beta -DCORTEX_CPP_VERSION='v${{ needs.get-update-version.outputs.new_version }}' -DCMAKE_TOOLCHAIN_FILE=/home/runner/actions-runner/_work/cortex.cpp/cortex.cpp/engine/vcpkg/scripts/buildsystems/vcpkg.cmake" channel: beta upload_url: ${{ needs.create-draft-release.outputs.upload_url }} - cortex-llamacpp-version: ${{ needs.get-cortex-llamacpp-latest-version.outputs.cortex_llamacpp_latest_version }} + llamacpp-version: ${{ needs.get-llamacpp-latest-version.outputs.llamacpp_latest_version }} arch: amd64 build-linux-arm64: uses: 
./.github/workflows/template-build-linux.yml secrets: inherit - needs: [get-update-version, create-draft-release, get-cortex-llamacpp-latest-version] + needs: [get-update-version, create-draft-release, get-llamacpp-latest-version] with: ref: ${{ github.ref }} public_provider: github @@ -93,13 +93,13 @@ jobs: cmake-flags: "-DCORTEX_VARIANT=beta -DCORTEX_CPP_VERSION='v${{ needs.get-update-version.outputs.new_version }}' -DCMAKE_TOOLCHAIN_FILE=/home/runner/actions-runner/_work/cortex.cpp/cortex.cpp/engine/vcpkg/scripts/buildsystems/vcpkg.cmake" channel: beta upload_url: ${{ needs.create-draft-release.outputs.upload_url }} - cortex-llamacpp-version: ${{ needs.get-cortex-llamacpp-latest-version.outputs.cortex_llamacpp_latest_version }} + llamacpp-version: ${{ needs.get-llamacpp-latest-version.outputs.llamacpp_latest_version }} arch: arm64 build-docker-x64: uses: ./.github/workflows/template-build-docker-x64.yml secrets: inherit - needs: [get-update-version, get-cortex-llamacpp-latest-version] + needs: [get-update-version, get-llamacpp-latest-version] with: ref: ${{ github.ref }} new_version: ${{ needs.get-update-version.outputs.new_version }} diff --git a/.github/workflows/cortex-cpp-quality-gate.yml b/.github/workflows/cortex-cpp-quality-gate.yml index 279dd77d6..fc2d52b63 100644 --- a/.github/workflows/cortex-cpp-quality-gate.yml +++ b/.github/workflows/cortex-cpp-quality-gate.yml @@ -150,6 +150,7 @@ jobs: run: | cd engine mkdir -p ~/.config/cortexcpp/ + mkdir -p ~/.local/share/cortexcpp/ echo "huggingFaceToken: ${{ secrets.HUGGINGFACE_TOKEN_READ }}" > ~/.config/cortexcpp/.cortexrc echo "gitHubToken: ${{ secrets.PAT_SERVICE_ACCOUNT }}" >> ~/.config/cortexcpp/.cortexrc # ./build/cortex @@ -177,6 +178,7 @@ jobs: run: | cd engine mkdir -p ~/.config/cortexcpp/ + mkdir -p ~/.local/share/cortexcpp/ echo "apiServerPort: 3928" > ~/.config/cortexcpp/.cortexrc echo "huggingFaceToken: ${{ secrets.HUGGINGFACE_TOKEN_READ }}" >> ~/.config/cortexcpp/.cortexrc echo "gitHubToken: ${{ secrets.PAT_SERVICE_ACCOUNT }}" >> ~/.config/cortexcpp/.cortexrc @@ -456,6 +458,7 @@ jobs: run: | cd engine mkdir -p ~/.config/cortexcpp/ + mkdir -p ~/.local/share/cortexcpp/ echo "gitHubToken: ${{ secrets.GITHUB_TOKEN }}" > ~/.config/cortexcpp/.cortexrc # ./build/cortex cat ~/.config/cortexcpp/.cortexrc @@ -481,6 +484,7 @@ jobs: run: | cd engine mkdir -p ~/.config/cortexcpp/ + mkdir -p ~/.local/share/cortexcpp/ echo "apiServerPort: 3928" > ~/.config/cortexcpp/.cortexrc echo "gitHubToken: ${{ secrets.GITHUB_TOKEN }}" > ~/.config/cortexcpp/.cortexrc # ./build/cortex diff --git a/.github/workflows/nightly-build.yml b/.github/workflows/nightly-build.yml index 1f076dc97..efdbfdf6f 100644 --- a/.github/workflows/nightly-build.yml +++ b/.github/workflows/nightly-build.yml @@ -43,12 +43,12 @@ jobs: get-update-version: uses: ./.github/workflows/template-get-update-version.yml - get-cortex-llamacpp-latest-version: + get-llamacpp-latest-version: uses: ./.github/workflows/template-cortex-llamacpp-latest-version.yml build-macos: uses: ./.github/workflows/template-build-macos.yml - needs: [get-update-version, set-public-provider, get-cortex-llamacpp-latest-version] + needs: [get-update-version, set-public-provider, get-llamacpp-latest-version] secrets: inherit with: ref: ${{ needs.set-public-provider.outputs.ref }} @@ -56,12 +56,12 @@ jobs: new_version: ${{ needs.get-update-version.outputs.new_version }} cmake-flags: "-DCORTEX_VARIANT=nightly -DCORTEX_CPP_VERSION='v${{ needs.get-update-version.outputs.new_version }}' 
-DCMAKE_TOOLCHAIN_FILE=vcpkg/scripts/buildsystems/vcpkg.cmake" channel: nightly - cortex-llamacpp-version: ${{ needs.get-cortex-llamacpp-latest-version.outputs.cortex_llamacpp_latest_version }} + llamacpp-version: ${{ needs.get-llamacpp-latest-version.outputs.llamacpp_latest_version }} build-windows-x64: uses: ./.github/workflows/template-build-windows-x64.yml secrets: inherit - needs: [get-update-version, set-public-provider, get-cortex-llamacpp-latest-version] + needs: [get-update-version, set-public-provider, get-llamacpp-latest-version] with: ref: ${{ needs.set-public-provider.outputs.ref }} public_provider: ${{ needs.set-public-provider.outputs.public_provider }} @@ -71,12 +71,12 @@ jobs: build-deps-cmake-flags: "-DCMAKE_BUILD_TYPE=RELEASE -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache' channel: nightly - cortex-llamacpp-version: ${{ needs.get-cortex-llamacpp-latest-version.outputs.cortex_llamacpp_latest_version }} + llamacpp-version: ${{ needs.get-llamacpp-latest-version.outputs.llamacpp_latest_version }} build-linux-x64: uses: ./.github/workflows/template-build-linux.yml secrets: inherit - needs: [get-update-version, set-public-provider, get-cortex-llamacpp-latest-version] + needs: [get-update-version, set-public-provider, get-llamacpp-latest-version] with: ref: ${{ needs.set-public-provider.outputs.ref }} public_provider: ${{ needs.set-public-provider.outputs.public_provider }} @@ -84,13 +84,13 @@ jobs: runs-on: ubuntu-20-04 cmake-flags: "-DCORTEX_VARIANT=nightly -DCORTEX_CPP_VERSION='v${{ needs.get-update-version.outputs.new_version }}' -DCMAKE_TOOLCHAIN_FILE=/home/runner/actions-runner/_work/cortex.cpp/cortex.cpp/engine/vcpkg/scripts/buildsystems/vcpkg.cmake" channel: nightly - cortex-llamacpp-version: ${{ needs.get-cortex-llamacpp-latest-version.outputs.cortex_llamacpp_latest_version }} + llamacpp-version: ${{ needs.get-llamacpp-latest-version.outputs.llamacpp_latest_version }} arch: amd64 build-linux-arm64: uses: ./.github/workflows/template-build-linux.yml secrets: inherit - needs: [get-update-version, set-public-provider, get-cortex-llamacpp-latest-version] + needs: [get-update-version, set-public-provider, get-llamacpp-latest-version] with: ref: ${{ needs.set-public-provider.outputs.ref }} public_provider: ${{ needs.set-public-provider.outputs.public_provider }} @@ -98,13 +98,13 @@ jobs: runs-on: ubuntu-2004-arm64 cmake-flags: "-DCORTEX_VARIANT=nightly -DCORTEX_CPP_VERSION='v${{ needs.get-update-version.outputs.new_version }}' -DCMAKE_TOOLCHAIN_FILE=/home/runner/actions-runner/_work/cortex.cpp/cortex.cpp/engine/vcpkg/scripts/buildsystems/vcpkg.cmake" channel: nightly - cortex-llamacpp-version: ${{ needs.get-cortex-llamacpp-latest-version.outputs.cortex_llamacpp_latest_version }} + llamacpp-version: ${{ needs.get-llamacpp-latest-version.outputs.llamacpp_latest_version }} arch: arm64 update-latest-version: runs-on: ubuntu-latest if: needs.set-public-provider.outputs.public_provider == 'aws-s3' - needs: [get-update-version, set-public-provider, build-linux-x64, build-linux-arm64, build-macos, build-windows-x64, get-cortex-llamacpp-latest-version] + needs: [get-update-version, set-public-provider, build-linux-x64, build-linux-arm64, build-macos, build-windows-x64, get-llamacpp-latest-version] steps: - name: Update latest version id: update-latest-version @@ -132,7 +132,7 @@ jobs: if: needs.set-public-provider.outputs.public_provider == 
'aws-s3' uses: ./.github/workflows/template-build-docker-x64.yml secrets: inherit - needs: [get-update-version, set-public-provider, get-cortex-llamacpp-latest-version, update-latest-version] + needs: [get-update-version, set-public-provider, get-llamacpp-latest-version, update-latest-version] with: ref: ${{ needs.set-public-provider.outputs.ref }} new_version: nightly-${{ needs.get-update-version.outputs.new_version }} @@ -141,7 +141,7 @@ jobs: tags: menloltd/cortex:nightly-${{ needs.get-update-version.outputs.new_version }} noti-discord-nightly-and-update-url-readme: - needs: [build-macos, build-windows-x64, build-linux-x64, get-update-version, set-public-provider, get-cortex-llamacpp-latest-version, update-latest-version, build-docker-x64] + needs: [build-macos, build-windows-x64, build-linux-x64, get-update-version, set-public-provider, get-llamacpp-latest-version, update-latest-version, build-docker-x64] secrets: inherit if: github.event_name == 'schedule' uses: ./.github/workflows/template-noti-discord.yaml @@ -150,7 +150,7 @@ jobs: new_version: ${{ needs.get-update-version.outputs.new_version }} noti-discord-manual: - needs: [build-macos, build-windows-x64, build-linux-x64, get-update-version, set-public-provider, get-cortex-llamacpp-latest-version, build-docker-x64] + needs: [build-macos, build-windows-x64, build-linux-x64, get-update-version, set-public-provider, get-llamacpp-latest-version, build-docker-x64] secrets: inherit if: github.event_name == 'workflow_dispatch' && github.event.inputs.public_provider == 'aws-s3' uses: ./.github/workflows/template-noti-discord.yaml diff --git a/.github/workflows/stable-build.yml b/.github/workflows/stable-build.yml index b05df983d..c4b5f53f3 100644 --- a/.github/workflows/stable-build.yml +++ b/.github/workflows/stable-build.yml @@ -9,7 +9,7 @@ jobs: get-update-version: uses: ./.github/workflows/template-get-update-version.yml - get-cortex-llamacpp-latest-version: + get-llamacpp-latest-version: uses: ./.github/workflows/template-cortex-llamacpp-latest-version.yml create-draft-release: @@ -39,7 +39,7 @@ jobs: build-macos: uses: ./.github/workflows/template-build-macos.yml - needs: [get-update-version, create-draft-release, get-cortex-llamacpp-latest-version] + needs: [get-update-version, create-draft-release, get-llamacpp-latest-version] secrets: inherit with: ref: ${{ github.ref }} @@ -48,12 +48,12 @@ jobs: cmake-flags: "-DCORTEX_VARIANT=prod -DCORTEX_CPP_VERSION='v${{ needs.get-update-version.outputs.new_version }}' -DCMAKE_TOOLCHAIN_FILE=vcpkg/scripts/buildsystems/vcpkg.cmake" channel: stable upload_url: ${{ needs.create-draft-release.outputs.upload_url }} - cortex-llamacpp-version: ${{ needs.get-cortex-llamacpp-latest-version.outputs.cortex_llamacpp_latest_version }} + llamacpp-version: ${{ needs.get-llamacpp-latest-version.outputs.llamacpp_latest_version }} build-windows-x64: uses: ./.github/workflows/template-build-windows-x64.yml secrets: inherit - needs: [get-update-version, create-draft-release, get-cortex-llamacpp-latest-version] + needs: [get-update-version, create-draft-release, get-llamacpp-latest-version] with: ref: ${{ github.ref }} public_provider: github @@ -64,12 +64,12 @@ jobs: ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache' channel: stable upload_url: ${{ needs.create-draft-release.outputs.upload_url }} - cortex-llamacpp-version: ${{ needs.get-cortex-llamacpp-latest-version.outputs.cortex_llamacpp_latest_version }} + llamacpp-version: ${{ needs.get-llamacpp-latest-version.outputs.llamacpp_latest_version 
}} build-linux-x64: uses: ./.github/workflows/template-build-linux.yml secrets: inherit - needs: [get-update-version, create-draft-release, get-cortex-llamacpp-latest-version] + needs: [get-update-version, create-draft-release, get-llamacpp-latest-version] with: ref: ${{ github.ref }} public_provider: github @@ -78,13 +78,13 @@ jobs: cmake-flags: "-DCORTEX_VARIANT=prod -DCORTEX_CPP_VERSION='v${{ needs.get-update-version.outputs.new_version }}' -DCMAKE_TOOLCHAIN_FILE=/home/runner/actions-runner/_work/cortex.cpp/cortex.cpp/engine/vcpkg/scripts/buildsystems/vcpkg.cmake" channel: stable upload_url: ${{ needs.create-draft-release.outputs.upload_url }} - cortex-llamacpp-version: ${{ needs.get-cortex-llamacpp-latest-version.outputs.cortex_llamacpp_latest_version }} + llamacpp-version: ${{ needs.get-llamacpp-latest-version.outputs.llamacpp_latest_version }} arch: amd64 build-linux-arm64: uses: ./.github/workflows/template-build-linux.yml secrets: inherit - needs: [get-update-version, create-draft-release, get-cortex-llamacpp-latest-version] + needs: [get-update-version, create-draft-release, get-llamacpp-latest-version] with: ref: ${{ github.ref }} public_provider: github @@ -93,13 +93,13 @@ jobs: cmake-flags: "-DCORTEX_VARIANT=prod -DCORTEX_CPP_VERSION='v${{ needs.get-update-version.outputs.new_version }}' -DCMAKE_TOOLCHAIN_FILE=/home/runner/actions-runner/_work/cortex.cpp/cortex.cpp/engine/vcpkg/scripts/buildsystems/vcpkg.cmake" channel: stable upload_url: ${{ needs.create-draft-release.outputs.upload_url }} - cortex-llamacpp-version: ${{ needs.get-cortex-llamacpp-latest-version.outputs.cortex_llamacpp_latest_version }} + llamacpp-version: ${{ needs.get-llamacpp-latest-version.outputs.llamacpp_latest_version }} arch: arm64 build-docker-x64: uses: ./.github/workflows/template-build-docker-x64.yml secrets: inherit - needs: [get-update-version, get-cortex-llamacpp-latest-version] + needs: [get-update-version, get-llamacpp-latest-version] with: ref: ${{ github.ref }} new_version: ${{ needs.get-update-version.outputs.new_version }} diff --git a/.github/workflows/template-build-linux.yml b/.github/workflows/template-build-linux.yml index 3fa802ad4..0ebd04176 100644 --- a/.github/workflows/template-build-linux.yml +++ b/.github/workflows/template-build-linux.yml @@ -44,7 +44,7 @@ on: type: string default: 'nightly' description: 'The channel to use for this job' - cortex-llamacpp-version: + llamacpp-version: required: true type: string default: '0.0.0' @@ -169,23 +169,23 @@ jobs: mkdir -p engine/templates/linux/dependencies cd engine/templates/linux/dependencies if [ "${{ inputs.arch }}" == "amd64" ]; then - # wget https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-linux-amd64-avx-cuda-11-7.tar.gz - # wget https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-linux-amd64-avx-cuda-12-0.tar.gz - # wget https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-linux-amd64-avx.tar.gz - wget https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-linux-amd64-avx2-cuda-11-7.tar.gz - wget https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version 
}}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-linux-amd64-avx2-cuda-12-0.tar.gz - wget https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-linux-amd64-avx2.tar.gz - # wget https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-linux-amd64-avx512-cuda-11-7.tar.gz - # wget https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-linux-amd64-avx512-cuda-12-0.tar.gz - # wget https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-linux-amd64-avx512.tar.gz - wget https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-linux-amd64-noavx-cuda-11-7.tar.gz - wget https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-linux-amd64-noavx-cuda-12-0.tar.gz - wget https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-linux-amd64-noavx.tar.gz - wget https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-linux-amd64-vulkan.tar.gz - wget https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cuda-11-7-linux-amd64.tar.gz - wget https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cuda-12-0-linux-amd64.tar.gz + # wget https://github.com/menloresearch/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/llama-${{ inputs.llamacpp-version }}-bin-linux-avx-cuda-cu11.7-x64.tar.gz + # wget https://github.com/menloresearch/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/llama-${{ inputs.llamacpp-version }}-bin-linux-avx-cuda-cu12.0-x64.tar.gz + # wget https://github.com/menloresearch/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/llama-${{ inputs.llamacpp-version }}-bin-linux-avx-x64.tar.gz + wget https://github.com/menloresearch/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/llama-${{ inputs.llamacpp-version }}-bin-linux-avx2-cuda-cu11.7-x64.tar.gz + wget https://github.com/menloresearch/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/llama-${{ inputs.llamacpp-version }}-bin-linux-avx2-cuda-cu12.0-x64.tar.gz + wget https://github.com/ggml-org/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/llama-${{ inputs.llamacpp-version }}-bin-ubuntu-x64.zip + # wget https://github.com/menloresearch/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/llama-${{ inputs.llamacpp-version }}-bin-linux-avx512-cuda-cu11.7-x64.tar.gz + # wget https://github.com/menloresearch/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/llama-${{ inputs.llamacpp-version }}-bin-linux-avx512-cuda-cu12.0-x64.tar.gz + # wget https://github.com/menloresearch/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/llama-${{ inputs.llamacpp-version }}-bin-linux-avx512-x64.tar.gz + wget 
https://github.com/menloresearch/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/llama-${{ inputs.llamacpp-version }}-bin-linux-noavx-cuda-cu11.7-x64.tar.gz + wget https://github.com/menloresearch/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/llama-${{ inputs.llamacpp-version }}-bin-linux-noavx-cuda-cu12.0-x64.tar.gz + wget https://github.com/menloresearch/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/llama-${{ inputs.llamacpp-version }}-bin-linux-noavx-x64.tar.gz + wget https://github.com/ggml-org/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/llama-${{ inputs.llamacpp-version }}-bin-ubuntu-vulkan-x64.zip + wget https://github.com/menloresearch/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/cudart-llama-bin-linux-cu11.7-x64.tar.gz + wget https://github.com/menloresearch/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/cudart-llama-bin-linux-cu12.0-x64.tar.gz else - wget https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-linux-arm64.tar.gz fi cd .. diff --git a/.github/workflows/template-build-macos.yml b/.github/workflows/template-build-macos.yml index 20c7430fb..ea96d2df6 100644 --- a/.github/workflows/template-build-macos.yml +++ b/.github/workflows/template-build-macos.yml @@ -39,7 +39,7 @@ on: type: string default: 'nightly' description: 'The channel to use for this job' - cortex-llamacpp-version: + llamacpp-version: required: true type: string default: '0.0.0' @@ -253,6 +253,14 @@ jobs: cd engine make codesign-binary CODE_SIGN=true DEVELOPER_ID="${{ secrets.DEVELOPER_ID }}" DESTINATION_BINARY_NAME="${{ steps.set-output-params.outputs.destination_binary_name }}" DESTINATION_BINARY_SERVER_NAME="${{ steps.set-output-params.outputs.destination_binary_server_name }}" + - name: Code Signing binaries for separate binary + run: | + codesign --force -s "${{ secrets.DEVELOPER_ID }}" --options=runtime --entitlements="./engine/templates/macos/entitlements.plist" ./cortex-${{ inputs.new_version }}-mac-arm64/${{ steps.set-output-params.outputs.destination_binary_name }} + codesign --force -s "${{ secrets.DEVELOPER_ID }}" --options=runtime --entitlements="./engine/templates/macos/entitlements.plist" ./cortex-${{ inputs.new_version }}-mac-arm64/${{ steps.set-output-params.outputs.destination_binary_server_name }} + + codesign --force -s "${{ secrets.DEVELOPER_ID }}" --options=runtime --entitlements="./engine/templates/macos/entitlements.plist" ./cortex-${{ inputs.new_version }}-mac-amd64/${{ steps.set-output-params.outputs.destination_binary_name }} + codesign --force -s "${{ secrets.DEVELOPER_ID }}" --options=runtime --entitlements="./engine/templates/macos/entitlements.plist" ./cortex-${{ inputs.new_version }}-mac-amd64/${{ steps.set-output-params.outputs.destination_binary_server_name }} + - name: Notary macOS Binary run: | curl -sSfL https://raw.githubusercontent.com/anchore/quill/main/install.sh | sh -s -- -b /usr/local/bin @@ -265,6 +273,18 @@ jobs: QUILL_NOTARY_ISSUER: ${{ secrets.NOTARY_ISSUER }} QUILL_NOTARY_KEY: "/tmp/notary-key.p8" + - name: Notary macOS Binary for separate binary + run: | + # Notarize the binary + quill notarize ./cortex-${{ inputs.new_version }}-mac-arm64/${{ steps.set-output-params.outputs.destination_binary_name }} + quill notarize ./cortex-${{
inputs.new_version }}-mac-arm64/${{ steps.set-output-params.outputs.destination_binary_server_name }} + quill notarize ./cortex-${{ inputs.new_version }}-mac-amd64/${{ steps.set-output-params.outputs.destination_binary_name }} + quill notarize ./cortex-${{ inputs.new_version }}-mac-amd64/${{ steps.set-output-params.outputs.destination_binary_server_name }} + env: + QUILL_NOTARY_KEY_ID: ${{ secrets.NOTARY_KEY_ID }} + QUILL_NOTARY_ISSUER: ${{ secrets.NOTARY_ISSUER }} + QUILL_NOTARY_KEY: "/tmp/notary-key.p8" + - name: Build network Installers shell: bash run: | @@ -289,8 +309,8 @@ jobs: run: | mkdir -p engine/templates/macos/Scripts/dependencies cd engine/templates/macos/Scripts/dependencies - wget https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-mac-arm64.tar.gz - wget https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-mac-amd64.tar.gz + wget https://github.com/menloresearch/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/llama-${{ inputs.llamacpp-version }}-bin-macos-arm64.tar.gz + wget https://github.com/menloresearch/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/llama-${{ inputs.llamacpp-version }}-bin-macos-x64.tar.gz cd ../../ chmod +x create_pkg_local.sh @@ -310,6 +330,24 @@ jobs: xcrun notarytool submit ${{ steps.set-output-params.outputs.package_name }}-local.pkg --apple-id ${{ secrets.APPLE_ID }} --password ${{ secrets.APPLE_APP_SPECIFIC_PASSWORD }} --team-id ${{ secrets.APPLE_TEAM_ID }} --wait - name: Package + run: | + mkdir temp + # Mac arm64 + mv cortex-${{ inputs.new_version }}-mac-arm64 temp/cortex + cd temp + tar -czvf cortex-arm64.tar.gz cortex + mv cortex-arm64.tar.gz ../cortex-arm64.tar.gz + cd .. + rm -rf temp/cortex + + # Mac amd64 + mv cortex-${{ inputs.new_version }}-mac-amd64 temp/cortex + cd temp + tar -czvf cortex-amd64.tar.gz cortex + mv cortex-amd64.tar.gz ../cortex-amd64.tar.gz + cd .. 
+ + - name: Package for separate binary run: | cd engine make package @@ -320,6 +358,18 @@ jobs: name: cortex-${{ inputs.new_version }}-mac-universal path: ./engine/cortex + - name: Upload Artifact + uses: actions/upload-artifact@v4 + with: + name: cortex-${{ inputs.new_version }}-mac-arm64-signed + path: ./cortex-${{ inputs.new_version }}-mac-arm64 + + - name: Upload Artifact + uses: actions/upload-artifact@v4 + with: + name: cortex-${{ inputs.new_version }}-mac-amd64-signed + path: ./cortex-${{ inputs.new_version }}-mac-amd64 + - name: Upload Artifact uses: actions/upload-artifact@v4 with: @@ -358,6 +408,28 @@ jobs: asset_name: cortex-${{ inputs.new_version }}-mac-universal.tar.gz asset_content_type: application/zip + - name: Upload release assert if public provider is github + if: inputs.public_provider == 'github' + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + uses: actions/upload-release-asset@v1.0.1 + with: + upload_url: ${{ inputs.upload_url }} + asset_path: ./cortex-arm64.tar.gz + asset_name: cortex-${{ inputs.new_version }}-mac-arm64.tar.gz + asset_content_type: application/zip + + - name: Upload release assert if public provider is github + if: inputs.public_provider == 'github' + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + uses: actions/upload-release-asset@v1.0.1 + with: + upload_url: ${{ inputs.upload_url }} + asset_path: ./cortex-amd64.tar.gz + asset_name: cortex-${{ inputs.new_version }}-mac-amd64.tar.gz + asset_content_type: application/zip + - name: Upload release assert if public provider is github if: inputs.public_provider == 'github' env: diff --git a/.github/workflows/template-build-windows-x64.yml b/.github/workflows/template-build-windows-x64.yml index b9e0c9937..399e3dd3e 100644 --- a/.github/workflows/template-build-windows-x64.yml +++ b/.github/workflows/template-build-windows-x64.yml @@ -44,7 +44,7 @@ on: type: string default: 'nightly' description: 'The channel to use for this job' - cortex-llamacpp-version: + llamacpp-version: required: true type: string default: '0.0.0' @@ -205,21 +205,21 @@ jobs: run: | mkdir dependencies cd dependencies - # wget.exe https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-windows-amd64-avx-cuda-11-7.tar.gz - # wget.exe https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-windows-amd64-avx-cuda-12-0.tar.gz - # wget.exe https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-windows-amd64-avx.tar.gz - wget.exe https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-windows-amd64-avx2-cuda-11-7.tar.gz - wget.exe https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-windows-amd64-avx2-cuda-12-0.tar.gz - wget.exe https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-windows-amd64-avx2.tar.gz - # wget.exe https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-windows-amd64-avx512-cuda-11-7.tar.gz 
- # wget.exe https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-windows-amd64-avx512-cuda-12-0.tar.gz - # wget.exe https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-windows-amd64-avx512.tar.gz - wget.exe https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-windows-amd64-noavx-cuda-11-7.tar.gz - wget.exe https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-windows-amd64-noavx-cuda-12-0.tar.gz - wget.exe https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-windows-amd64-noavx.tar.gz - wget.exe https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-windows-amd64-vulkan.tar.gz - wget.exe https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cuda-11-7-windows-amd64.tar.gz - wget.exe https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cuda-12-0-windows-amd64.tar.gz + # wget.exe https://github.com/menloresearch/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/llama-${{ inputs.llamacpp-version }}-bin-win-avx-cuda-cu11.7-x64.tar.gz + # wget.exe https://github.com/menloresearch/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/llama-${{ inputs.llamacpp-version }}-bin-win-avx-cuda-cu12.0-x64.tar.gz + # wget.exe https://github.com/ggml-org/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/llama-${{ inputs.llamacpp-version }}-bin-win-avx-x64.zip + wget.exe https://github.com/menloresearch/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/llama-${{ inputs.llamacpp-version }}-bin-win-avx2-cuda-cu11.7-x64.tar.gz + wget.exe https://github.com/menloresearch/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/llama-${{ inputs.llamacpp-version }}-bin-win-avx2-cuda-cu12.0-x64.tar.gz + wget.exe https://github.com/ggml-org/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/llama-${{ inputs.llamacpp-version }}-bin-win-avx2-x64.zip + # wget.exe https://github.com/menloresearch/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/llama-${{ inputs.llamacpp-version }}-bin-win-avx512-cuda-cu11.7-x64.tar.gz + # wget.exe https://github.com/menloresearch/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/llama-${{ inputs.llamacpp-version }}-bin-win-avx512-cuda-cu12.0-x64.tar.gz + # wget.exe https://github.com/ggml-org/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/llama-${{ inputs.llamacpp-version }}-bin-win-avx512-x64.zip + wget.exe https://github.com/menloresearch/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/llama-${{ inputs.llamacpp-version }}-bin-win-noavx-cuda-cu11.7-x64.tar.gz + wget.exe https://github.com/menloresearch/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/llama-${{ inputs.llamacpp-version }}-bin-win-noavx-cuda-cu12.0-x64.tar.gz + wget.exe https://github.com/ggml-org/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/llama-${{ inputs.llamacpp-version 
}}-bin-win-noavx-x64.zip + wget.exe https://github.com/ggml-org/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/llama-${{ inputs.llamacpp-version }}-bin-win-vulkan-x64.zip + wget.exe https://github.com/menloresearch/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/cudart-llama-bin-win-cu11.7-x64.tar.gz + wget.exe https://github.com/menloresearch/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/cudart-llama-bin-win-cu12.0-x64.tar.gz - name: Enable long paths run: | diff --git a/.github/workflows/template-cortex-llamacpp-latest-version.yml b/.github/workflows/template-cortex-llamacpp-latest-version.yml index 610b1a89a..3d7b74e56 100644 --- a/.github/workflows/template-cortex-llamacpp-latest-version.yml +++ b/.github/workflows/template-cortex-llamacpp-latest-version.yml @@ -1,13 +1,13 @@ -name: get-cortex-llamacpp-latest-version +name: get-llamacpp-latest-version on: workflow_call: outputs: - cortex_llamacpp_latest_version: + llamacpp_latest_version: description: 'The latest version of cortex.llamacpp engines' - value: ${{ jobs.get-cortex-llamacpp-latest-version.outputs.new_version }} + value: ${{ jobs.get-llamacpp-latest-version.outputs.new_version }} jobs: - get-cortex-llamacpp-latest-version: + get-llamacpp-latest-version: runs-on: ubuntu-latest outputs: new_version: ${{ steps.version_update.outputs.new_version }} @@ -24,7 +24,7 @@ jobs: local max_retries=3 local tag while [ $retries -lt $max_retries ]; do - tag=$(curl -s https://api.github.com/repos/menloresearch/cortex.llamacpp/releases/latest | jq -r .tag_name) + tag=$(curl -s https://api.github.com/repos/menloresearch/llama.cpp/releases/latest | jq -r .tag_name) if [ -n "$tag" ] && [ "$tag" != "null" ]; then echo $tag return diff --git a/docker/Dockerfile b/docker/Dockerfile index 744c3899c..5f04da12e 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -24,7 +24,6 @@ RUN wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/nul apt-add-repository "deb https://apt.kitware.com/ubuntu/ $(lsb_release -cs) main" && \ apt-get update && \ apt-get install -y --no-install-recommends \ - cmake \ make \ git \ uuid-dev \ @@ -37,11 +36,21 @@ RUN wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/nul ninja-build \ pkg-config \ python3-pip \ - openssl && \ + openssl \ + libssl-dev && \ pip3 install awscli && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* +# Download and install CMake 3.22.6 +RUN wget https://github.com/Kitware/CMake/releases/download/v3.22.6/cmake-3.22.6.tar.gz -q -O /tmp/cmake.tar.gz && \ + tar -xzf /tmp/cmake.tar.gz -C /tmp && \ + cd /tmp/cmake-3.22.6 && \ + ./bootstrap && \ + make -j$(nproc) && \ + make install && \ + rm -rf /tmp/cmake.tar.gz /tmp/cmake-3.22.6 + ARG CORTEX_CPP_VERSION=latest ARG CMAKE_EXTRA_FLAGS="" diff --git a/docker/Dockerfile.cache b/docker/Dockerfile.cache index 0a9cbe02d..3eabc5dce 100644 --- a/docker/Dockerfile.cache +++ b/docker/Dockerfile.cache @@ -24,7 +24,6 @@ RUN wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/nul apt-add-repository "deb https://apt.kitware.com/ubuntu/ $(lsb_release -cs) main" && \ apt-get update && \ apt-get install -y --no-install-recommends \ - cmake \ make \ git \ uuid-dev \ @@ -37,11 +36,21 @@ RUN wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/nul ninja-build \ pkg-config \ python3-pip \ - openssl && \ + openssl \ + libssl-dev && \ pip3 install awscli && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* +# Download and install CMake 3.22.6 +RUN wget 
https://github.com/Kitware/CMake/releases/download/v3.22.6/cmake-3.22.6.tar.gz -q -O /tmp/cmake.tar.gz && \ + tar -xzf /tmp/cmake.tar.gz -C /tmp && \ + cd /tmp/cmake-3.22.6 && \ + ./bootstrap && \ + make -j$(nproc) && \ + make install && \ + rm -rf /tmp/cmake.tar.gz /tmp/cmake-3.22.6 + ARG CORTEX_CPP_VERSION=latest ARG CMAKE_EXTRA_FLAGS="" diff --git a/docs/docs/engines/engine-extension.mdx b/docs/docs/engines/engine-extension.mdx index d2edde830..8b550c5a4 100644 --- a/docs/docs/engines/engine-extension.mdx +++ b/docs/docs/engines/engine-extension.mdx @@ -71,9 +71,6 @@ class EngineI { std::shared_ptr json_body, std::function&& callback) = 0; - // Compatibility and model management - virtual bool IsSupported(const std::string& f) = 0; - virtual void GetModels( std::shared_ptr jsonBody, std::function&& callback) = 0; diff --git a/docs/static/openapi/cortex.json b/docs/static/openapi/cortex.json index 23970ef51..b7d628094 100644 --- a/docs/static/openapi/cortex.json +++ b/docs/static/openapi/cortex.json @@ -2754,7 +2754,7 @@ }, "version": { "type": "string", - "example": "0.1.35-28.10.24" + "example": "b4920" } } } @@ -2763,11 +2763,11 @@ { "engine": "llama-cpp", "name": "mac-arm64", - "version": "0.1.35-28.10.24" + "version": "b4920" }, { "engine": "llama-cpp", - "name": "linux-amd64-avx", + "name": "linux-avx-x64", "version": "0.1.35-27.10.24" } ] @@ -2901,7 +2901,7 @@ "name": { "type": "string", "description": "The name of the variant, including OS, architecture, and capabilities", - "example": "linux-amd64-avx-cuda-11-7" + "example": "linux-avx-x64-cuda-11-7" }, "created_at": { "type": "string", @@ -2973,7 +2973,7 @@ }, "name": { "type": "string", - "example": "0.1.39-linux-amd64-avx-cuda-11-7" + "example": "llama-b4920-bin-linux-avx-cuda-cu11.7" }, "size": { "type": "integer", @@ -3250,7 +3250,7 @@ }, "version": { "type": "string", - "example": "0.1.35-28.10.24" + "example": "b4920" } } } diff --git a/engine/CMakeLists.txt b/engine/CMakeLists.txt index f7a20b58b..39052b08e 100644 --- a/engine/CMakeLists.txt +++ b/engine/CMakeLists.txt @@ -182,6 +182,7 @@ add_executable(${TARGET_NAME} main.cc ${CMAKE_CURRENT_SOURCE_DIR}/utils/process/utils.cc ${CMAKE_CURRENT_SOURCE_DIR}/extensions/remote-engine/remote_engine.cc + ${CMAKE_CURRENT_SOURCE_DIR}/extensions/local-engine/local_engine.cc ) @@ -227,3 +228,12 @@ set_target_properties(${TARGET_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY_RELEASE ${CMAKE_BINARY_DIR} RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR} ) + +if(MSVC) + add_custom_command( + TARGET ${TARGET_NAME} POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy_directory + ${CMAKE_CURRENT_SOURCE_DIR}/../.github/patches/windows + ${CMAKE_BINARY_DIR}/ + ) +endif() \ No newline at end of file diff --git a/engine/cli/CMakeLists.txt b/engine/cli/CMakeLists.txt index 4163042d0..bb18433fe 100644 --- a/engine/cli/CMakeLists.txt +++ b/engine/cli/CMakeLists.txt @@ -73,7 +73,7 @@ add_executable(${TARGET_NAME} main.cc ${CMAKE_CURRENT_SOURCE_DIR}/../services/hardware_service.cc ${CMAKE_CURRENT_SOURCE_DIR}/../services/database_service.cc ${CMAKE_CURRENT_SOURCE_DIR}/../extensions/remote-engine/remote_engine.cc - + ${CMAKE_CURRENT_SOURCE_DIR}/../extensions/local-engine/local_engine.cc ${CMAKE_CURRENT_SOURCE_DIR}/../extensions/template_renderer.cc ${CMAKE_CURRENT_SOURCE_DIR}/utils/easywsclient.cc diff --git a/engine/cli/command_line_parser.cc b/engine/cli/command_line_parser.cc index 99f51983e..aa0b9aab4 100644 --- a/engine/cli/command_line_parser.cc +++ b/engine/cli/command_line_parser.cc @@ -33,6 +33,7 @@ #include 
"services/engine_service.h" #include "utils/file_manager_utils.h" #include "utils/logging_utils.h" +#include "utils/task_queue.h" namespace { constexpr const auto kCommonCommandsGroup = "Common Commands"; @@ -50,8 +51,7 @@ CommandLineParser::CommandLineParser() download_service_{std::make_shared()}, dylib_path_manager_{std::make_shared()}, db_service_{std::make_shared()}, - engine_service_{std::make_shared( - download_service_, dylib_path_manager_, db_service_)} {} + engine_service_{std::make_shared(dylib_path_manager_)} {} bool CommandLineParser::SetupCommand(int argc, char** argv) { app_.usage("Usage:\n" + commands::GetCortexBinary() + diff --git a/engine/cli/commands/cortex_upd_cmd.cc b/engine/cli/commands/cortex_upd_cmd.cc index e11ad4290..33a51ed53 100644 --- a/engine/cli/commands/cortex_upd_cmd.cc +++ b/engine/cli/commands/cortex_upd_cmd.cc @@ -532,10 +532,10 @@ bool CortexUpdCmd::GetLinuxInstallScript(const std::string& v, const std::string& channel) { std::vector path_list; if (channel == "nightly") { - path_list = {"menloresearch", "cortex.cpp", "dev", "engine", + path_list = {kMenloOrg, "cortex.cpp", "dev", "engine", "templates", "linux", "install.sh"}; } else { - path_list = {"menloresearch", "cortex.cpp", "main", "engine", + path_list = {kMenloOrg, "cortex.cpp", "main", "engine", "templates", "linux", "install.sh"}; } auto url_obj = url_parser::Url{ diff --git a/engine/cli/commands/cortex_upd_cmd.h b/engine/cli/commands/cortex_upd_cmd.h index 7f02839cf..fdee6cc49 100644 --- a/engine/cli/commands/cortex_upd_cmd.h +++ b/engine/cli/commands/cortex_upd_cmd.h @@ -79,9 +79,9 @@ inline std::vector GetReleasePath() { if (CORTEX_VARIANT == file_manager_utils::kNightlyVariant) { return {"cortex", "latest", "version.json"}; } else if (CORTEX_VARIANT == file_manager_utils::kBetaVariant) { - return {"repos", "menloresearch", "cortex.cpp", "releases"}; + return {"repos", kMenloOrg, "cortex.cpp", "releases"}; } else { - return {"repos", "menloresearch", "cortex.cpp", "releases", "latest"}; + return {"repos", kMenloOrg, "cortex.cpp", "releases", "latest"}; } } diff --git a/engine/cli/commands/engine_install_cmd.cc b/engine/cli/commands/engine_install_cmd.cc index bebfdb8ce..b31aecaa6 100644 --- a/engine/cli/commands/engine_install_cmd.cc +++ b/engine/cli/commands/engine_install_cmd.cc @@ -92,7 +92,10 @@ bool EngineInstallCmd::Exec(const std::string& engine, std::vector variant_selections; for (const auto& variant : variant_result.value()) { auto v_name = variant["name"].asString(); - if (string_utils::StringContainsIgnoreCase(v_name, hw_inf_.sys_inf->os) && + if ((string_utils::StringContainsIgnoreCase(v_name, + hw_inf_.sys_inf->os) || + (hw_inf_.sys_inf->os == kLinuxOs && + string_utils::StringContainsIgnoreCase(v_name, kUbuntuOs))) && string_utils::StringContainsIgnoreCase(v_name, hw_inf_.sys_inf->arch)) { variant_selections.push_back(variant["name"].asString()); diff --git a/engine/cli/commands/server_start_cmd.cc b/engine/cli/commands/server_start_cmd.cc index af2d647e2..e074ee18a 100644 --- a/engine/cli/commands/server_start_cmd.cc +++ b/engine/cli/commands/server_start_cmd.cc @@ -106,10 +106,8 @@ bool ServerStartCmd::Exec(const std::string& host, int port, #else std::vector commands; // Some engines requires to add lib search path before process being created - auto download_srv = std::make_shared(); - auto dylib_path_mng = std::make_shared(); - auto db_srv = std::make_shared(); - EngineService(download_srv, dylib_path_mng, db_srv).RegisterEngineLibPath(); + 
EngineService(std::make_shared()) + .RegisterEngineLibPath(); std::string p = cortex_utils::GetCurrentPath() + "/" + exe; commands.push_back(p); diff --git a/engine/cli/main.cc b/engine/cli/main.cc index a4e6c38cc..1fa45d6fd 100644 --- a/engine/cli/main.cc +++ b/engine/cli/main.cc @@ -155,7 +155,7 @@ int main(int argc, char* argv[]) { auto get_latest_version = []() -> cpp::result { try { auto res = github_release_utils::GetReleaseByVersion( - "menloresearch", "cortex.llamacpp", "latest"); + kGgmlOrg, kLlamaRepo, "latest"); if (res.has_error()) { CTL_ERR("Failed to get latest llama.cpp version: " << res.error()); return cpp::fail("Failed to get latest llama.cpp version: " + diff --git a/engine/cli/utils/download_progress.cc b/engine/cli/utils/download_progress.cc index 7538fff46..32cc6e20a 100644 --- a/engine/cli/utils/download_progress.cc +++ b/engine/cli/utils/download_progress.cc @@ -83,8 +83,8 @@ bool DownloadProgress::Handle( size_t max_length = 20) -> std::string { // Check the length of the input string if (str.length() >= max_length) { - return str.substr( - 0, max_length); // Return truncated string if it's too long + return str.substr(0, max_length - 3) + + ".. "; // Return truncated string if it's too long } // Calculate the number of spaces needed diff --git a/engine/controllers/engines.cc b/engine/controllers/engines.cc index f7deb41eb..2a9427abf 100644 --- a/engine/controllers/engines.cc +++ b/engine/controllers/engines.cc @@ -155,6 +155,7 @@ void Engines::GetEngineVariants( releases.append(json.value()); } } + CTL_INF(releases.toStyledString()); auto resp = cortex_utils::CreateCortexHttpJsonResponse(releases); resp->setStatusCode(k200OK); callback(resp); @@ -177,6 +178,8 @@ void Engines::InstallEngine( } norm_version = version; } + CTL_INF("version: " << norm_version + << ", norm_variant: " << norm_variant.value_or("")); auto result = engine_service_->InstallEngineAsync(engine, norm_version, norm_variant); diff --git a/engine/controllers/server.cc b/engine/controllers/server.cc index 079b69423..6ea733a70 100644 --- a/engine/controllers/server.cc +++ b/engine/controllers/server.cc @@ -138,7 +138,7 @@ void server::ProcessStreamRes(std::function cb, auto err_or_done = std::make_shared(false); auto chunked_content_provider = [this, q, err_or_done, engine_type, model_id]( char* buf, - std::size_t buf_size) -> std::size_t { + std::size_t buf_size) -> std::size_t { if (buf == nullptr) { LOG_TRACE << "Buf is null"; if (!(*err_or_done)) { diff --git a/engine/cortex-common/EngineI.h b/engine/cortex-common/EngineI.h index b796ebaed..2518b0ce5 100644 --- a/engine/cortex-common/EngineI.h +++ b/engine/cortex-common/EngineI.h @@ -47,9 +47,6 @@ class EngineI { std::shared_ptr json_body, std::function&& callback) = 0; - // For backward compatible checking - virtual bool IsSupported(const std::string& f) = 0; - // Get list of running models virtual void GetModels( std::shared_ptr jsonBody, diff --git a/engine/cortex-common/remote_enginei.h b/engine/cortex-common/remote_enginei.h index 835f526a0..163490cdc 100644 --- a/engine/cortex-common/remote_enginei.h +++ b/engine/cortex-common/remote_enginei.h @@ -1,7 +1,5 @@ #pragma once -#pragma once - #include #include diff --git a/engine/e2e-test/api/engines/test_api_engine.py b/engine/e2e-test/api/engines/test_api_engine.py index 7356ef904..842ef2c35 100644 --- a/engine/e2e-test/api/engines/test_api_engine.py +++ b/engine/e2e-test/api/engines/test_api_engine.py @@ -28,14 +28,14 @@ def test_engines_get_llamacpp_should_be_successful(self): # engines 
install def test_engines_install_llamacpp_specific_version_and_variant(self): - data = {"version": "v0.1.40-b4354", "variant": "linux-amd64-avx"} + data = {"version": "b4932", "variant": "linux-avx-x64"} response = requests.post( "http://localhost:3928/v1/engines/llama-cpp/install", json=data ) assert response.status_code == 200 def test_engines_install_llamacpp_specific_version_and_null_variant(self): - data = {"version": "v0.1.40-b4354"} + data = {"version": "b4932"} response = requests.post( "http://localhost:3928/v1/engines/llama-cpp/install", json=data ) @@ -55,14 +55,14 @@ async def test_engines_install_uninstall_llamacpp_should_be_successful(self): @pytest.mark.asyncio async def test_engines_install_uninstall_llamacpp_with_only_version_should_be_failed(self): # install first - data = {"variant": "mac-arm64"} + data = {"variant": "linux-avx-x64"} install_response = requests.post( "http://127.0.0.1:3928/v1/engines/llama-cpp/install", json=data ) await wait_for_websocket_download_success_event(timeout=120) assert install_response.status_code == 200 - data = {"version": "v0.1.35"} + data = {"version": "b4932"} response = requests.delete( "http://localhost:3928/v1/engines/llama-cpp/install", json=data ) @@ -72,7 +72,7 @@ async def test_engines_install_uninstall_llamacpp_with_only_version_should_be_fa @pytest.mark.asyncio async def test_engines_install_uninstall_llamacpp_with_variant_should_be_successful(self): # install first - data = {"variant": "mac-arm64"} + data = {"variant": "linux-avx-x64"} install_response = requests.post( "http://127.0.0.1:3928/v1/engines/llama-cpp/install", json=data ) @@ -85,7 +85,7 @@ async def test_engines_install_uninstall_llamacpp_with_variant_should_be_success def test_engines_install_uninstall_llamacpp_with_specific_variant_and_version_should_be_successful( self, ): - data = {"variant": "mac-arm64", "version": "v0.1.35"} + data = {"variant": "linux-avx-x64", "version": "b4932"} # install first install_response = requests.post( "http://localhost:3928/v1/engines/llama-cpp/install", json=data diff --git a/engine/e2e-test/api/engines/test_api_engine_install_nightly.py b/engine/e2e-test/api/engines/test_api_engine_install_nightly.py index e92afb14b..088cc2474 100644 --- a/engine/e2e-test/api/engines/test_api_engine_install_nightly.py +++ b/engine/e2e-test/api/engines/test_api_engine_install_nightly.py @@ -2,7 +2,7 @@ import requests from utils.test_runner import start_server, stop_server, get_latest_pre_release_tag -latest_pre_release_tag = get_latest_pre_release_tag("menloresearch", "cortex.llamacpp") +latest_pre_release_tag = get_latest_pre_release_tag("menloresearch", "llama.cpp") class TestApiEngineInstall: @@ -23,7 +23,7 @@ def test_engines_install_llamacpp_should_be_successful(self): assert response.status_code == 200 def test_engines_install_llamacpp_specific_version_and_variant(self): - data = {"version": latest_pre_release_tag, "variant": "linux-amd64-avx"} + data = {"version": latest_pre_release_tag, "variant": "linux-avx-x64"} response = requests.post( "http://localhost:3928/v1/engines/llama-cpp/install", json=data ) diff --git a/engine/e2e-test/api/engines/test_api_get_default_engine.py b/engine/e2e-test/api/engines/test_api_get_default_engine.py index 2dfc467a3..f0566128c 100644 --- a/engine/e2e-test/api/engines/test_api_get_default_engine.py +++ b/engine/e2e-test/api/engines/test_api_get_default_engine.py @@ -24,8 +24,8 @@ def setup_and_teardown(self): def test_api_get_default_engine_successfully(self): # Data test engine= "llama-cpp" - name= 
"linux-amd64-avx" - version= "v0.1.35-27.10.24" + name= "linux-avx-x64" + version= "b4932" data = {"version": version, "variant": name} post_install_url = f"http://localhost:3928/v1/engines/{engine}/install" diff --git a/engine/e2e-test/api/engines/test_api_get_list_engine.py b/engine/e2e-test/api/engines/test_api_get_list_engine.py index e6baa22a6..38cb45b39 100644 --- a/engine/e2e-test/api/engines/test_api_get_list_engine.py +++ b/engine/e2e-test/api/engines/test_api_get_list_engine.py @@ -24,8 +24,8 @@ def setup_and_teardown(self): def test_api_get_list_engines_successfully(self): # Data test engine= "llama-cpp" - name= "linux-amd64-avx" - version= "v0.1.35-27.10.24" + name= "linux-avx-x64" + version= "b4932" post_install_url = f"http://localhost:3928/v1/engines/{engine}/install" response = requests.delete( diff --git a/engine/e2e-test/api/engines/test_api_post_default_engine.py b/engine/e2e-test/api/engines/test_api_post_default_engine.py index b2b4e4c48..cede78485 100644 --- a/engine/e2e-test/api/engines/test_api_post_default_engine.py +++ b/engine/e2e-test/api/engines/test_api_post_default_engine.py @@ -23,8 +23,8 @@ def setup_and_teardown(self): def test_api_set_default_engine_successfully(self): # Data test engine= "llama-cpp" - name= "linux-amd64-avx" - version= "v0.1.35-27.10.24" + name= "linux-avx-x64" + version= "b4932" data = {"version": version, "variant": name} post_install_url = f"http://localhost:3928/v1/engines/{engine}/install" diff --git a/engine/e2e-test/api/files/test_api_create_file.py b/engine/e2e-test/api/files/test_api_create_file.py index 7c7226f50..03525672d 100644 --- a/engine/e2e-test/api/files/test_api_create_file.py +++ b/engine/e2e-test/api/files/test_api_create_file.py @@ -23,7 +23,6 @@ def setup_and_teardown(self): # Teardown stop_server() - @pytest.mark.skipif(platform.system() != "Linux", reason="Todo: fix later on Mac and Window") def test_api_create_file_successfully(self): # Define file path file_path_rel = os.path.join("e2e-test", "api", "files", "blank.txt") diff --git a/engine/e2e-test/api/hardware/test_api_get_hardware.py b/engine/e2e-test/api/hardware/test_api_get_hardware.py index 59b15ac18..0efecdbdc 100644 --- a/engine/e2e-test/api/hardware/test_api_get_hardware.py +++ b/engine/e2e-test/api/hardware/test_api_get_hardware.py @@ -88,25 +88,6 @@ def test_api_get_hardware_successfully(self): "example": True, "description": "Indicates if the GPU is currently activated." }, - "additional_information": { - "type": "object", - "properties": { - "compute_cap": { - "type": "string", - "example": "8.6", - "description": "The compute capability of the GPU." - }, - "driver_version": { - "type": "string", - "example": "535.183", - "description": "The version of the installed driver." 
- } - }, - "required": [ - "compute_cap", - "driver_version" - ] - }, "free_vram": { "type": "integer", "example": 23983, @@ -140,7 +121,6 @@ def test_api_get_hardware_successfully(self): }, "required": [ "activated", - "additional_information", "free_vram", "id", "name", diff --git a/engine/e2e-test/api/model/test_api_model.py b/engine/e2e-test/api/model/test_api_model.py index bacf7e1b0..f370b1daa 100644 --- a/engine/e2e-test/api/model/test_api_model.py +++ b/engine/e2e-test/api/model/test_api_model.py @@ -1,6 +1,7 @@ import pytest import requests import time +import platform from utils.test_runner import ( run, start_server, @@ -95,6 +96,7 @@ async def test_models_start_stop_should_be_successful(self): time.sleep(30) print("Pull model") + requests.delete("http://localhost:3928/v1/models/tinyllama:1b") json_body = {"model": "tinyllama:1b"} response = requests.post("http://localhost:3928/v1/models/pull", json=json_body) assert response.status_code == 200, f"Failed to pull model: tinyllama:1b" @@ -110,16 +112,18 @@ async def test_models_start_stop_should_be_successful(self): response = requests.get("http://localhost:3928/v1/models") assert response.status_code == 200 - print("Start model") - json_body = {"model": "tinyllama:1b"} - response = requests.post( - "http://localhost:3928/v1/models/start", json=json_body - ) - assert response.status_code == 200, f"status_code: {response.status_code}" + # Skip tests for linux arm + if platform.machine() != "aarch64": + print("Start model") + json_body = {"model": "tinyllama:1b"} + response = requests.post( + "http://localhost:3928/v1/models/start", json=json_body + ) + assert response.status_code == 200, f"status_code: {response.status_code}" - print("Stop model") - response = requests.post("http://localhost:3928/v1/models/stop", json=json_body) - assert response.status_code == 200, f"status_code: {response.status_code}" + print("Stop model") + response = requests.post("http://localhost:3928/v1/models/stop", json=json_body) + assert response.status_code == 200, f"status_code: {response.status_code}" # update API print("Update model") diff --git a/engine/e2e-test/cli/engines/test_cli_engine_install.py b/engine/e2e-test/cli/engines/test_cli_engine_install.py index 370ebe3f3..5d520ce8b 100644 --- a/engine/e2e-test/cli/engines/test_cli_engine_install.py +++ b/engine/e2e-test/cli/engines/test_cli_engine_install.py @@ -31,25 +31,9 @@ def test_engines_install_llamacpp_should_be_successfully(self): assert len(response.json()) > 0 assert exit_code == 0, f"Install engine failed with error: {error}" - @pytest.mark.skipif(reason="Ignore onnx-runtime test") - def test_engines_install_onnx_on_macos_should_be_failed(self): - exit_code, output, error = run( - "Install Engine", ["engines", "install", "onnxruntime"] - ) - assert "is not supported on" in output, "Should display error message" - assert exit_code == 0, f"Install engine failed with error: {error}" - - @pytest.mark.skipif(reason="Ignore tensorrt-llm test") - def test_engines_install_onnx_on_tensorrt_should_be_failed(self): - exit_code, output, error = run( - "Install Engine", ["engines", "install", "tensorrt-llm"] - ) - assert "is not supported on" in output, "Should display error message" - assert exit_code == 0, f"Install engine failed with error: {error}" - @pytest.mark.skipif(platform.system() == "Windows", reason="Progress bar log issue on Windows") def test_engines_install_pre_release_llamacpp(self): - engine_version = "v0.1.43" + engine_version = "b4932" exit_code, output, error = run( "Install 
Engine", ["engines", "install", "llama-cpp", "-v", engine_version], diff --git a/engine/e2e-test/runner/cortex-llamacpp-e2e-nightly.py b/engine/e2e-test/runner/cortex-llamacpp-e2e-nightly.py index 9fc296d60..ea3cae242 100644 --- a/engine/e2e-test/runner/cortex-llamacpp-e2e-nightly.py +++ b/engine/e2e-test/runner/cortex-llamacpp-e2e-nightly.py @@ -21,7 +21,7 @@ from api.engines.test_api_get_default_engine import TestApiDefaultEngine from api.engines.test_api_get_engine_release import TestApiEngineRelease from api.engines.test_api_get_engine_release_latest import TestApiEngineReleaseLatest -from test_api_post_default_engine import TestApiSetDefaultEngine +from api.engines.test_api_post_default_engine import TestApiSetDefaultEngine from api.model.test_api_model import TestApiModel from api.model.test_api_model_import import TestApiModelImport from api.files.test_api_create_file import TestApiCreateFile diff --git a/engine/e2e-test/runner/main.py b/engine/e2e-test/runner/main.py index 49bdc5131..8a98d0ca3 100644 --- a/engine/e2e-test/runner/main.py +++ b/engine/e2e-test/runner/main.py @@ -21,7 +21,7 @@ from api.engines.test_api_get_default_engine import TestApiDefaultEngine from api.engines.test_api_get_engine_release import TestApiEngineRelease from api.engines.test_api_get_engine_release_latest import TestApiEngineReleaseLatest -from test_api_post_default_engine import TestApiSetDefaultEngine +from api.engines.test_api_post_default_engine import TestApiSetDefaultEngine from api.model.test_api_model import TestApiModel from api.model.test_api_model_import import TestApiModelImport from api.files.test_api_create_file import TestApiCreateFile diff --git a/engine/e2e-test/test_api_cortexso_hub_llamacpp_engine.py b/engine/e2e-test/test_api_cortexso_hub_llamacpp_engine.py index 7a3c2e232..a22000d93 100644 --- a/engine/e2e-test/test_api_cortexso_hub_llamacpp_engine.py +++ b/engine/e2e-test/test_api_cortexso_hub_llamacpp_engine.py @@ -125,7 +125,7 @@ async def test_models_on_cortexso_hub(self, model_url): "Install Engine", ["engines", "install", "llama-cpp"], timeout=None, capture = False ) root = Path.home() - assert os.path.exists(root / "cortexcpp" / "engines" / "cortex.llamacpp" / "version.txt") + assert os.path.exists(root / "cortexcpp" / "engines" / "llama.cpp" / "version.txt") assert exit_code == 0, f"Install engine failed with error: {error}" # Start the model diff --git a/engine/extensions/local-engine/local_engine.cc b/engine/extensions/local-engine/local_engine.cc new file mode 100644 index 000000000..885c14d77 --- /dev/null +++ b/engine/extensions/local-engine/local_engine.cc @@ -0,0 +1,1035 @@ +#include "local_engine.h" +#include +#include +#include +#include "utils/curl_utils.h" +#include "utils/json_helper.h" +#include "utils/logging_utils.h" +#include "utils/process/utils.h" +#include "utils/url_parser.h" + +namespace cortex::local { + +namespace { +const std::unordered_set kIgnoredParams = { + "model", "model_alias", "embedding", "ai_prompt", + "ai_template", "prompt_template", "mmproj", "system_prompt", + "created", "stream", "name", "os", + "owned_by", "files", "gpu_arch", "quantization_method", + "engine", "system_template", "max_tokens", "user_template", + "user_prompt", "min_keep", "mirostat", "mirostat_eta", + "mirostat_tau", "text_model", "version", "n_probs", + "object", "penalize_nl", "precision", "size", + "stop", "tfs_z", "typ_p"}; + +const std::unordered_map kParamsMap = { + {"cpu_threads", "--threads"}, + {"n_ubatch", "--ubatch-size"}, + {"n_batch", "--batch-size"}, + 
{"n_parallel", "--parallel"}, + {"temperature", "--temp"}, + {"top_k", "--top-k"}, + {"top_p", "--top-p"}, + {"min_p", "--min-p"}, + {"dynatemp_exponent", "--dynatemp-exp"}, + {"ctx_len", "--ctx-size"}, + {"ngl", "-ngl"}, +}; + +int GenerateRandomInteger(int min, int max) { + static std::random_device rd; // Seed for the random number engine + static std::mt19937 gen(rd()); // Mersenne Twister random number engine + std::uniform_int_distribution<> dis( + min, max); // Distribution for the desired range + + return dis(gen); // Generate and return a random integer within the range +} + +std::vector ConvertJsonToParamsVector(const Json::Value& root) { + std::vector res; + std::string errors; + + for (const auto& member : root.getMemberNames()) { + if (member == "model_path" || member == "llama_model_path") { + if (!root[member].isNull()) { + res.push_back("--model"); + res.push_back(root[member].asString()); + } + continue; + } else if (kIgnoredParams.find(member) != kIgnoredParams.end()) { + continue; + } else if (kParamsMap.find(member) != kParamsMap.end()) { + res.push_back(kParamsMap.at(member)); + res.push_back(root[member].asString()); + continue; + } else if (member == "model_type") { + if (root[member].asString() == "embedding") { + res.push_back("--embedding"); + } + continue; + } + + res.push_back("--" + member); + if (root[member].isString()) { + res.push_back(root[member].asString()); + } else if (root[member].isInt()) { + res.push_back(std::to_string(root[member].asInt())); + } else if (root[member].isDouble()) { + res.push_back(std::to_string(root[member].asDouble())); + } else if (root[member].isArray()) { + std::stringstream ss; + ss << "["; + bool first = true; + for (const auto& value : root[member]) { + if (!first) { + ss << ", "; + } + ss << "\"" << value.asString() << "\""; + first = false; + } + ss << "] "; + res.push_back(ss.str()); + } + } + + return res; +} + +constexpr const auto kMinDataChunkSize = 6u; + +struct OaiInfo { + std::string model; + bool include_usage = false; + bool oai_endpoint = false; + int n_probs = 0; +}; + +struct StreamingCallback { + std::shared_ptr callback; + bool need_stop = true; + OaiInfo oi; +}; + +struct Usage { + int prompt_tokens = 0; + int completion_tokens = 0; +}; + +std::string GenerateRandomString(std::size_t length) { + const std::string characters = + "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; + + std::random_device rd; + std::mt19937 generator(rd()); + + std::uniform_int_distribution<> distribution( + 0, static_cast(characters.size()) - 1); + + std::string random_string(length, '\0'); + std::generate_n(random_string.begin(), length, + [&]() { return characters[distribution(generator)]; }); + + return random_string; +} + +std::vector GetUTF8Bytes(const std::string& str) { + std::vector bytes; + for (unsigned char c : str) { + bytes.push_back(static_cast(c)); + } + return bytes; +} + +Json::Value TransformLogProbs(const Json::Value& logprobs) { + Json::Value root; + Json::Value logprobs_json(Json::arrayValue); + + // Iterate through each token group in the input + for (const auto& token_group : logprobs) { + Json::Value content_item; + + // Set the token (content) + content_item["token"] = token_group["content"].asString(); + + // Get the probabilities array + const auto& probs = token_group["probs"]; + + // Set the main token's logprob (first probability) + if (!probs.empty()) { + content_item["logprob"] = std::log( + probs[0]["prob"].asDouble() + std::numeric_limits::epsilon()); + } + + // Get UTF-8 bytes 
for the token + auto bytes = GetUTF8Bytes(token_group["content"].asString()); + Json::Value bytes_array(Json::arrayValue); + for (int byte : bytes) { + bytes_array.append(byte); + } + content_item["bytes"] = bytes_array; + + // Create top_logprobs array + Json::Value top_logprobs(Json::arrayValue); + for (const auto& prob_item : probs) { + Json::Value logprob_item; + logprob_item["token"] = prob_item["tok_str"].asString(); + logprob_item["logprob"] = + std::log(prob_item["prob"].asDouble() + + std::numeric_limits::epsilon()); + + // Get UTF-8 bytes for this alternative token + auto alt_bytes = GetUTF8Bytes(prob_item["tok_str"].asString()); + Json::Value alt_bytes_array(Json::arrayValue); + for (int byte : alt_bytes) { + alt_bytes_array.append(byte); + } + logprob_item["bytes"] = alt_bytes_array; + + top_logprobs.append(logprob_item); + } + content_item["top_logprobs"] = top_logprobs; + + logprobs_json.append(content_item); + } + root["content"] = logprobs_json; + return root; +} + +std::string CreateReturnJson( + const std::string& id, const std::string& model, const std::string& content, + Json::Value finish_reason, bool include_usage, + std::optional usage = std::nullopt, + std::optional logprobs = std::nullopt) { + Json::Value root; + + root["id"] = id; + root["model"] = model; + root["created"] = static_cast(std::time(nullptr)); + root["object"] = "chat.completion.chunk"; + + Json::Value choicesArray(Json::arrayValue); + // If usage, the choices field will always be an empty array + if (!usage) { + Json::Value choice; + + choice["index"] = 0; + Json::Value delta; + delta["content"] = content; + delta["role"] = "assistant"; + choice["delta"] = delta; + choice["finish_reason"] = finish_reason; + if (logprobs.has_value() && !logprobs.value().empty()) { + choice["logprobs"] = TransformLogProbs(logprobs.value()); + } + + choicesArray.append(choice); + } + root["choices"] = choicesArray; + if (include_usage) { + if (usage) { + Json::Value usage_json; + Json::Value details; + details["reasoning_tokens"] = 0; + usage_json["prompt_tokens"] = (*usage).prompt_tokens; + usage_json["completion_tokens"] = (*usage).completion_tokens; + usage_json["total_tokens"] = + (*usage).prompt_tokens + (*usage).completion_tokens; + usage_json["completion_tokens_details"] = details; + root["usage"] = usage_json; + } else { + root["usage"] = Json::Value(); + } + } + + Json::StreamWriterBuilder writer; + writer["indentation"] = ""; // This sets the indentation to an empty string, + // producing compact output. 
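  // [Editor's note - illustrative only, not part of the patch] With
  // include_usage == false, the compact string produced below looks roughly
  // like the following (field order may differ, since jsoncpp sorts object
  // keys alphabetically):
  //   {"choices":[{"delta":{"content":"Hello","role":"assistant"},
  //     "finish_reason":"","index":0}],"created":1710000000,
  //     "id":"<20 random chars>","model":"<model id>",
  //     "object":"chat.completion.chunk"}
  // The streaming WriteCallback prefixes this string with "data: " to form
  // one SSE event.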
+ return Json::writeString(writer, root); +} + +size_t WriteCallback(char* ptr, size_t size, size_t nmemb, void* userdata) { + auto* sc = static_cast(userdata); + size_t data_length = size * nmemb; + + if (ptr && data_length > kMinDataChunkSize) { + std::string chunk(ptr + kMinDataChunkSize, data_length - kMinDataChunkSize); + CTL_DBG(chunk); + if (sc->oi.oai_endpoint) { + if (chunk.find("[DONE]") != std::string::npos) { + Json::Value status; + status["is_done"] = true; + status["has_error"] = false; + status["is_stream"] = true; + status["status_code"] = 200; + Json::Value chunk_json; + chunk_json["data"] = "data: [DONE]"; + sc->need_stop = false; + (*sc->callback)(std::move(status), std::move(chunk_json)); + return data_length; + } + if (!sc->oi.include_usage && + chunk.find("completion_tokens") != std::string::npos) { + return data_length; + } + + Json::Value chunk_json; + chunk_json["data"] = "data: " + chunk; + Json::Value status; + status["is_done"] = false; + status["has_error"] = false; + status["is_stream"] = true; + status["status_code"] = 200; + (*sc->callback)(std::move(status), std::move(chunk_json)); + } else { + if (chunk.find("[DONE]") != std::string::npos) { + Json::Value status; + status["is_done"] = true; + status["has_error"] = false; + status["is_stream"] = true; + status["status_code"] = 200; + Json::Value chunk_json; + chunk_json["data"] = "data: [DONE]"; + sc->need_stop = false; + (*sc->callback)(std::move(status), std::move(chunk_json)); + return data_length; + } + auto json_data = json_helper::ParseJsonString(chunk); + // DONE + if (!json_data.isNull() && json_data.isMember("timings")) { + std::optional u; + if (sc->oi.include_usage) { + u = Usage{json_data["tokens_evaluated"].asInt(), + json_data["tokens_predicted"].asInt()}; + } + + Json::Value chunk_json; + chunk_json["data"] = + "data: " + CreateReturnJson(GenerateRandomString(20), sc->oi.model, + "", "stop", sc->oi.include_usage, u); + Json::Value status; + status["is_done"] = false; + status["has_error"] = false; + status["is_stream"] = true; + status["status_code"] = 200; + (*sc->callback)(std::move(status), std::move(chunk_json)); + + sc->need_stop = false; + return data_length; + } + + Json::Value logprobs; + if (sc->oi.n_probs > 0) { + logprobs = json_data["completion_probabilities"]; + } + std::string to_send; + if (json_data.isMember("choices") && json_data["choices"].isArray() && + json_data["choices"].size() > 0) { + to_send = json_data["choices"][0].get("text", "").asString(); + } + CTL_DBG(to_send); + const std::string str = + CreateReturnJson(GenerateRandomString(20), sc->oi.model, to_send, "", + sc->oi.include_usage, std::nullopt, logprobs); + Json::Value chunk_json; + chunk_json["data"] = "data: " + str; + Json::Value status; + status["is_done"] = false; + status["has_error"] = false; + status["is_stream"] = true; + status["status_code"] = 200; + (*sc->callback)(std::move(status), std::move(chunk_json)); + return data_length; + } + } + + return data_length; +} + +Json::Value ConvertLogitBiasToArray(const Json::Value& input) { + Json::Value result(Json::arrayValue); + if (input.isObject()) { + const auto& member_names = input.getMemberNames(); + for (const auto& tokenStr : member_names) { + Json::Value pair(Json::arrayValue); + pair.append(std::stoi(tokenStr)); + pair.append(input[tokenStr].asFloat()); + result.append(pair); + } + } + return result; +} + +Json::Value CreateFullReturnJson( + const std::string& id, const std::string& model, const std::string& content, + const std::string& 
system_fingerprint, int prompt_tokens, + int completion_tokens, Json::Value finish_reason = Json::Value(), + std::optional logprobs = std::nullopt) { + Json::Value root; + + root["id"] = id; + root["model"] = model; + root["created"] = static_cast(std::time(nullptr)); + root["object"] = "chat.completion"; + root["system_fingerprint"] = system_fingerprint; + + Json::Value choicesArray(Json::arrayValue); + Json::Value choice; + + choice["index"] = 0; + Json::Value message; + message["role"] = "assistant"; + message["content"] = content; + choice["message"] = message; + choice["finish_reason"] = finish_reason; + if (logprobs.has_value() && !logprobs.value().empty()) { + choice["logprobs"] = TransformLogProbs(logprobs.value()); + } + + choicesArray.append(choice); + root["choices"] = choicesArray; + + Json::Value usage; + usage["prompt_tokens"] = prompt_tokens; + usage["completion_tokens"] = completion_tokens; + usage["total_tokens"] = prompt_tokens + completion_tokens; + root["usage"] = usage; + + return root; +} + +} // namespace + +LocalEngine::~LocalEngine() { + for (auto& [_, si] : server_map_) { + (void)cortex::process::KillProcess(si.process_info); + } + server_map_.clear(); +} +void LocalEngine::HandleChatCompletion(std::shared_ptr json_body, + http_callback&& callback) { + auto model_id = json_body->get("model", "").asString(); + if (model_id.empty()) { + CTL_WRN("Model is empty"); + } + if (server_map_.find(model_id) != server_map_.end()) { + auto& s = server_map_[model_id]; + auto oaicompat = [&json_body]() -> bool { + if (json_body->isMember("logprobs") && + (*json_body)["logprobs"].asBool()) { + return false; + } + return true; + }(); + if (oaicompat) { + HandleOpenAiChatCompletion( + json_body, const_cast(callback), model_id); + } else { + HandleNonOpenAiChatCompletion( + json_body, const_cast(callback), model_id); + } + } else { + Json::Value error; + error["error"] = "Model is not loaded yet: " + model_id; + Json::Value status; + status["is_done"] = true; + status["has_error"] = true; + status["is_stream"] = false; + status["status_code"] = 400; + callback(std::move(status), std::move(error)); + } +} + +void LocalEngine::HandleEmbedding(std::shared_ptr json_body, + http_callback&& callback) { + auto model_id = json_body->get("model", "").asString(); + if (model_id.empty()) { + CTL_WRN("Model is empty"); + } + if (server_map_.find(model_id) != server_map_.end()) { + auto& s = server_map_[model_id]; + auto url = url_parser::Url{ + /*.protocol*/ "http", + /*.host*/ s.host + ":" + std::to_string(s.port), + /*.pathParams*/ {"v1", "embeddings"}, + /* .queries = */ {}, + }; + + auto response = curl_utils::SimplePostJson(url.ToFullPath(), + json_body->toStyledString()); + + if (response.has_error()) { + CTL_WRN("Error: " << response.error()); + Json::Value error; + error["error"] = response.error(); + Json::Value status; + status["is_done"] = true; + status["has_error"] = true; + status["is_stream"] = false; + status["status_code"] = 400; + callback(std::move(status), std::move(error)); + } else { + Json::Value status; + status["is_done"] = true; + status["has_error"] = false; + status["is_stream"] = false; + status["status_code"] = 200; + callback(std::move(status), std::move(response.value())); + } + } else { + Json::Value error; + error["error"] = "Model is not loaded yet: " + model_id; + Json::Value status; + status["is_done"] = true; + status["has_error"] = true; + status["is_stream"] = false; + status["status_code"] = 400; + callback(std::move(status), std::move(error)); + } +} + 
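// [Editor's note] Every handler in this file reports results back through the
// http_callback as a (status, body) pair of Json::Value objects. The sketch
// below is illustrative only and is not part of this patch; the helper name
// MakeStatus is hypothetical, and it assumes jsoncpp's Json::Value as already
// included by this file.

Json::Value MakeStatus(bool is_done, bool has_error, bool is_stream,
                       int status_code) {
  Json::Value status;
  status["is_done"] = is_done;          // request fully handled (or failed)
  status["has_error"] = has_error;      // body carries an "error" field
  status["is_stream"] = is_stream;      // body is an SSE chunk, not a full reply
  status["status_code"] = status_code;  // HTTP-style status for the caller
  return status;
}

// Usage sketch, e.g. the "model not loaded" error path repeated by several
// handlers in this file:
//   Json::Value error;
//   error["error"] = "Model is not loaded yet: " + model_id;
//   callback(MakeStatus(/*is_done=*/true, /*has_error=*/true,
//                       /*is_stream=*/false, /*status_code=*/400),
//            std::move(error));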
+void LocalEngine::LoadModel(std::shared_ptr json_body, + http_callback&& callback) { + CTL_INF("Start loading model"); + auto wait_for_server_up = [this](const std::string& model, + const std::string& host, int port) { + auto url = url_parser::Url{ + /*.protocol*/ "http", + /*.host*/ host + ":" + std::to_string(port), + /*.pathParams*/ {"health"}, + /*.queries*/ {}, + }; + while (server_map_.find(model) != server_map_.end()) { + auto res = curl_utils::SimpleGet(url.ToFullPath()); + if (res.has_error()) { + LOG_INFO << "Wait for server up .."; + std::this_thread::sleep_for(std::chrono::seconds(1)); + } else { + return true; + } + } + return false; + }; + + LOG_DEBUG << "Start to spawn llama-server"; + auto model_id = json_body->get("model", "").asString(); + if (model_id.empty()) { + CTL_WRN("Model is empty"); + } + server_map_[model_id].host = "127.0.0.1"; + server_map_[model_id].port = GenerateRandomInteger(39400, 39999); + auto& s = server_map_[model_id]; + s.pre_prompt = json_body->get("pre_prompt", "").asString(); + s.user_prompt = json_body->get("user_prompt", "USER: ").asString(); + s.ai_prompt = json_body->get("ai_prompt", "ASSISTANT: ").asString(); + s.system_prompt = + json_body->get("system_prompt", "ASSISTANT's RULE: ").asString(); + std::vector params = ConvertJsonToParamsVector(*json_body); + params.push_back("--host"); + params.push_back(s.host); + params.push_back("--port"); + params.push_back(std::to_string(s.port)); + + params.push_back("--pooling"); + params.push_back("mean"); + + std::vector v; + v.reserve(params.size() + 1); + auto engine_dir = engine_service_.GetEngineDirPath(kLlamaRepo); + if (engine_dir.has_error()) { + CTL_WRN(engine_dir.error()); + server_map_.erase(model_id); + return; + } + auto exe = (engine_dir.value().first / kLlamaServer).string(); + + v.push_back(exe); + v.insert(v.end(), params.begin(), params.end()); + engine_service_.RegisterEngineLibPath(); + + auto log_path = + (file_manager_utils::GetCortexLogPath() / "logs" / "cortex.log").string(); + CTL_DBG("log: " << log_path); + auto result = cortex::process::SpawnProcess(v, log_path, log_path); + if (result.has_error()) { + CTL_ERR("Fail to spawn process. 
" << result.error()); + Json::Value error; + error["error"] = "Fail to spawn process"; + Json::Value status; + status["is_done"] = true; + status["has_error"] = true; + status["is_stream"] = false; + status["status_code"] = 500; + callback(std::move(status), std::move(error)); + server_map_.erase(model_id); + return; + } + + s.process_info = result.value(); + if (wait_for_server_up(model_id, s.host, s.port)) { + s.start_time = std::chrono::system_clock::now().time_since_epoch() / + std::chrono::milliseconds(1); + Json::Value response; + response["status"] = "Model loaded successfully with pid: " + + std::to_string(s.process_info.pid); + Json::Value status; + status["is_done"] = true; + status["has_error"] = false; + status["is_stream"] = false; + status["status_code"] = 200; + callback(std::move(status), std::move(response)); + } else { + server_map_.erase(model_id); + Json::Value error; + error["error"] = "Wait for server up timeout"; + Json::Value status; + status["is_done"] = true; + status["has_error"] = true; + status["is_stream"] = false; + status["status_code"] = 500; + callback(std::move(status), std::move(error)); + } +} + +void LocalEngine::UnloadModel(std::shared_ptr json_body, + http_callback&& callback) { + auto model_id = json_body->get("model", "").asString(); + if (model_id.empty()) { + CTL_WRN("Model is empty"); + } + + if (server_map_.find(model_id) != server_map_.end()) { + auto& s = server_map_[model_id]; +#if defined(_WIN32) || defined(_WIN64) + auto sent = cortex::process::KillProcess(s.process_info); +#else + auto sent = (kill(s.process_info.pid, SIGTERM) != -1); +#endif + if (sent) { + LOG_INFO << "SIGINT signal sent to child process"; + Json::Value response; + response["status"] = "Model unloaded successfully with pid: " + + std::to_string(s.process_info.pid); + Json::Value status; + status["is_done"] = true; + status["has_error"] = false; + status["is_stream"] = false; + status["status_code"] = 200; + callback(std::move(status), std::move(response)); + server_map_.erase(model_id); + } else { + LOG_ERROR << "Failed to send SIGINT signal to child process"; + Json::Value error; + error["error"] = "Failed to unload model: " + model_id; + Json::Value status; + status["is_done"] = true; + status["has_error"] = true; + status["is_stream"] = false; + status["status_code"] = 500; + callback(std::move(status), std::move(error)); + } + } else { + Json::Value error; + error["error"] = "Model is not loaded yet: " + model_id; + Json::Value status; + status["is_done"] = true; + status["has_error"] = true; + status["is_stream"] = false; + status["status_code"] = 400; + callback(std::move(status), std::move(error)); + } +} + +void LocalEngine::GetModelStatus(std::shared_ptr json_body, + http_callback&& callback) { + auto model_id = json_body->get("model", "").asString(); + if (model_id.empty()) { + CTL_WRN("Model is empty"); + } + if (server_map_.find(model_id) != server_map_.end()) { + Json::Value response; + response["status"] = "Model is loaded"; + Json::Value status; + status["is_done"] = true; + status["has_error"] = false; + status["is_stream"] = false; + status["status_code"] = 200; + callback(std::move(status), std::move(response)); + } else { + Json::Value error; + error["error"] = "Model is not loaded yet: " + model_id; + Json::Value status; + status["is_done"] = true; + status["has_error"] = true; + status["is_stream"] = false; + status["status_code"] = 400; + callback(std::move(status), std::move(error)); + } +} + +void LocalEngine::GetModels(std::shared_ptr json_body, 
+ http_callback&& callback) { + Json::Value json_resp; + Json::Value model_array(Json::arrayValue); + { + for (const auto& [m, s] : server_map_) { + Json::Value val; + val["id"] = m; + val["engine"] = kLlamaEngine; + val["start_time"] = s.start_time; + val["model_size"] = 0u; + val["vram"] = 0u; + val["ram"] = 0u; + val["object"] = "model"; + model_array.append(val); + } + } + + json_resp["object"] = "list"; + json_resp["data"] = model_array; + + Json::Value status; + status["is_done"] = true; + status["has_error"] = false; + status["is_stream"] = false; + status["status_code"] = 200; + callback(std::move(status), std::move(json_resp)); + CTL_INF("Running models responded"); + (void)json_body; +} + +void LocalEngine::HandleOpenAiChatCompletion( + std::shared_ptr json_body, http_callback&& callback, + const std::string& model) { + CTL_DBG("Hanle OpenAI chat completion"); + auto is_stream = (*json_body).get("stream", false).asBool(); + auto include_usage = [&json_body, is_stream]() -> bool { + if (is_stream) { + if (json_body->isMember("stream_options") && + !(*json_body)["stream_options"].isNull()) { + return (*json_body)["stream_options"] + .get("include_usage", false) + .asBool(); + } + return false; + } + return false; + }(); + + auto n = [&json_body, is_stream]() -> int { + if (is_stream) + return 1; + return (*json_body).get("n", 1).asInt(); + }(); + + auto& s = server_map_.at(model); + // Format logit_bias + if (json_body->isMember("logit_bias")) { + auto logit_bias = ConvertLogitBiasToArray((*json_body)["logit_bias"]); + (*json_body)["logit_bias"] = logit_bias; + } + // llama.cpp server only supports n = 1 + (*json_body)["n"] = 1; + + auto url = url_parser::Url{ + /*.protocol*/ "http", + /*.host*/ s.host + ":" + std::to_string(s.port), + /*.pathParams*/ {"v1", "chat", "completions"}, + /*.queries*/ {}, + }; + + if (is_stream) { + q_.RunInQueue([s, json_body, callback, model, url = std::move(url)] { + auto curl = curl_easy_init(); + if (!curl) { + CTL_WRN("Failed to initialize CURL"); + return; + } + + curl_easy_setopt(curl, CURLOPT_URL, url.ToFullPath().c_str()); + curl_easy_setopt(curl, CURLOPT_POST, 1L); + CTL_INF(url.ToFullPath()); + + struct curl_slist* headers = nullptr; + headers = curl_slist_append(headers, "Content-Type: application/json"); + curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers); + auto json_str = json_body->toStyledString(); + curl_easy_setopt(curl, CURLOPT_POSTFIELDS, json_str.c_str()); + curl_easy_setopt(curl, CURLOPT_POSTFIELDSIZE, json_str.length()); + curl_easy_setopt(curl, CURLOPT_TCP_KEEPALIVE, 1L); + + StreamingCallback sc; + OaiInfo oi{model, false /*include_usage*/, true /*oai_endpoint*/, + 0 /*n_probs*/}; + sc.callback = std::make_shared(callback); + sc.need_stop = true; + sc.oi = oi; + + curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteCallback); + curl_easy_setopt(curl, CURLOPT_WRITEDATA, &sc); + auto res = curl_easy_perform(curl); + + if (res != CURLE_OK) { + CTL_WRN("CURL request failed: " << curl_easy_strerror(res)); + + Json::Value status; + status["is_done"] = true; + status["has_error"] = true; + status["is_stream"] = true; + status["status_code"] = 500; + + Json::Value error; + error["error"] = curl_easy_strerror(res); + callback(std::move(status), std::move(error)); + } + curl_easy_cleanup(curl); + if (sc.need_stop) { + CTL_DBG("No stop message received, need to stop"); + Json::Value status; + status["is_done"] = true; + status["has_error"] = false; + status["is_stream"] = true; + status["status_code"] = 200; + 
(*sc.callback)(std::move(status), Json::Value()); + } + }); + + } else { + Json::Value result; + // multiple choices + for (int i = 0; i < n; i++) { + auto response = curl_utils::SimplePostJson(url.ToFullPath(), + json_body->toStyledString()); + + if (response.has_value()) { + auto r = response.value(); + if (i == 0) { + result = r; + } else { + r["choices"][0]["index"] = i; + result["choices"].append(r["choices"][0]); + result["usage"]["completion_tokens"] = + result["usage"]["completion_tokens"].asInt() + + r["usage"]["completion_tokens"].asInt(); + result["usage"]["prompt_tokens"] = + result["usage"]["prompt_tokens"].asInt() + + r["usage"]["prompt_tokens"].asInt(); + result["usage"]["total_tokens"] = + result["usage"]["total_tokens"].asInt() + + r["usage"]["total_tokens"].asInt(); + } + + if (i == n - 1) { + Json::Value status; + status["is_done"] = true; + status["has_error"] = false; + status["is_stream"] = false; + status["status_code"] = 200; + callback(std::move(status), std::move(result)); + } + } else { + CTL_WRN("Error: " << response.error()); + Json::Value status; + status["is_done"] = true; + status["has_error"] = true; + status["is_stream"] = false; + status["status_code"] = 500; + callback(std::move(status), std::move(response.value())); + break; + } + } + } +} + +// (sang) duplicate code but it is easier to clean when +// llama-server upstream is fully OpenAI API Compatible +void LocalEngine::HandleNonOpenAiChatCompletion( + std::shared_ptr json_body, http_callback&& callback, + const std::string& model) { + CTL_DBG("Hanle NonOpenAI chat completion"); + auto is_stream = (*json_body).get("stream", false).asBool(); + auto include_usage = [&json_body, is_stream]() -> bool { + if (is_stream) { + if (json_body->isMember("stream_options") && + !(*json_body)["stream_options"].isNull()) { + return (*json_body)["stream_options"] + .get("include_usage", false) + .asBool(); + } + return false; + } + return false; + }(); + + auto n = [&json_body, is_stream]() -> int { + if (is_stream) + return 1; + return (*json_body).get("n", 1).asInt(); + }(); + + auto& s = server_map_.at(model); + + // Format logit_bias + if (json_body->isMember("logit_bias")) { + auto logit_bias = ConvertLogitBiasToArray((*json_body)["logit_bias"]); + (*json_body)["logit_bias"] = logit_bias; + } + auto get_message = [](const Json::Value& msg_content) -> std::string { + if (msg_content.isArray()) { + for (const auto& mc : msg_content) { + if (mc["type"].asString() == "text") { + return mc["text"].asString(); + } + } + } else { + return msg_content.asString(); + } + return ""; + }; + + if (!json_body->isMember("prompt") || + (*json_body)["prompt"].asString().empty()) { + auto formatted_output = s.pre_prompt; + for (const auto& message : (*json_body)["messages"]) { + auto input_role = message["role"].asString(); + std::string role; + if (input_role == "user") { + role = s.user_prompt; + } else if (input_role == "assistant") { + role = s.ai_prompt; + } else if (input_role == "system") { + role = s.system_prompt; + } else { + role = input_role; + } + + if (auto content = get_message(message["content"]); !content.empty()) { + formatted_output += role + content; + } + } + formatted_output += s.ai_prompt; + (*json_body)["prompt"] = formatted_output; + } + + (*json_body)["n"] = 1; + int n_probs = json_body->get("n_probs", 0).asInt(); + + auto url = url_parser::Url{ + /*.protocol*/ "http", + /*.host*/ s.host + ":" + std::to_string(s.port), + /*.pathParams*/ {"v1", "completions"}, + /*.queries*/ {}, + }; + + if (is_stream) { 
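    // [Editor's note] Streaming branch of the non-OpenAI-compatible path: the
    // prompt assembled above from the pre/user/ai/system templates is POSTed
    // to the spawned llama-server's /v1/completions endpoint on the background
    // task queue; WriteCallback (oai_endpoint == false) re-wraps each upstream
    // chunk into an OpenAI-style "chat.completion.chunk" SSE event, and if no
    // terminal chunk arrives (sc.need_stop stays true) a closing is_done
    // status is emitted so the client stream terminates cleanly.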
+ q_.RunInQueue([s, json_body, callback, n_probs, model, + url = std::move(url)] { + auto curl = curl_easy_init(); + if (!curl) { + CTL_WRN("Failed to initialize CURL"); + return; + } + + curl_easy_setopt(curl, CURLOPT_URL, url.ToFullPath().c_str()); + curl_easy_setopt(curl, CURLOPT_POST, 1L); + + struct curl_slist* headers = nullptr; + headers = curl_slist_append(headers, "Content-Type: application/json"); + curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers); + auto json_str = json_body->toStyledString(); + curl_easy_setopt(curl, CURLOPT_POSTFIELDS, json_str.c_str()); + curl_easy_setopt(curl, CURLOPT_POSTFIELDSIZE, json_str.length()); + curl_easy_setopt(curl, CURLOPT_TCP_KEEPALIVE, 1L); + + StreamingCallback sc; + OaiInfo oi{model, false /*include_usage*/, false /*oai_endpoint*/, + n_probs}; + sc.callback = std::make_shared(callback); + sc.need_stop = true; + sc.oi = oi; + + curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteCallback); + curl_easy_setopt(curl, CURLOPT_WRITEDATA, &sc); + auto res = curl_easy_perform(curl); + + if (res != CURLE_OK) { + CTL_WRN("CURL request failed: " << curl_easy_strerror(res)); + + Json::Value status; + status["is_done"] = true; + status["has_error"] = true; + status["is_stream"] = true; + status["status_code"] = 500; + + Json::Value error; + error["error"] = curl_easy_strerror(res); + callback(std::move(status), std::move(error)); + } + curl_easy_cleanup(curl); + if (sc.need_stop) { + CTL_DBG("No stop message received, need to stop"); + Json::Value status; + status["is_done"] = true; + status["has_error"] = false; + status["is_stream"] = true; + status["status_code"] = 200; + (*sc.callback)(std::move(status), Json::Value()); + } + }); + + } else { + + Json::Value result; + int prompt_tokens = 0; + int predicted_tokens = 0; + // multiple choices + for (int i = 0; i < n; i++) { + auto response = curl_utils::SimplePostJson(url.ToFullPath(), + json_body->toStyledString()); + if (response.has_value()) { + auto r = response.value(); + Json::Value logprobs; + prompt_tokens += r["tokens_evaluated"].asInt(); + predicted_tokens += r["tokens_predicted"].asInt(); + std::string to_send = r["content"].asString(); + string_utils::LTrim(to_send); + if (n_probs > 0) { + logprobs = r["completion_probabilities"]; + } + if (i == 0) { + result = CreateFullReturnJson( + GenerateRandomString(20), model, to_send, "_", prompt_tokens, + predicted_tokens, Json::Value("stop"), logprobs); + } else { + auto choice = CreateFullReturnJson( + GenerateRandomString(20), model, to_send, "_", prompt_tokens, + predicted_tokens, Json::Value("stop"), logprobs)["choices"][0]; + choice["index"] = i; + result["choices"].append(choice); + result["usage"]["completion_tokens"] = predicted_tokens; + result["usage"]["prompt_tokens"] = prompt_tokens; + result["usage"]["total_tokens"] = predicted_tokens + prompt_tokens; + } + + if (i == n - 1) { + Json::Value status; + status["is_done"] = true; + status["has_error"] = false; + status["is_stream"] = false; + status["status_code"] = 200; + callback(std::move(status), std::move(result)); + } + } else { + CTL_WRN("Error: " << response.error()); + Json::Value status; + status["is_done"] = true; + status["has_error"] = true; + status["is_stream"] = false; + status["status_code"] = 500; + callback(std::move(status), std::move(response.value())); + break; + } + } + } +} + +} // namespace cortex::local diff --git a/engine/extensions/local-engine/local_engine.h b/engine/extensions/local-engine/local_engine.h new file mode 100644 index 000000000..6dd970799 --- 
/dev/null +++ b/engine/extensions/local-engine/local_engine.h @@ -0,0 +1,75 @@ +#pragma once + +#include +#include +#include +#include +#include "cortex-common/EngineI.h" +#include "json/json.h" +#include "services/engine_service.h" +#include "utils/process/utils.h" +#include "utils/task_queue.h" + +namespace cortex::local { +using http_callback = std::function; + +struct ServerAddress { + std::string host; + int port; + cortex::process::ProcessInfo process_info; + std::string pre_prompt; + std::string user_prompt; + std::string ai_prompt; + std::string system_prompt; + uint64_t start_time; +}; + +class LocalEngine : public EngineI { + public: + LocalEngine(EngineService& engine_service, TaskQueue& q) + : engine_service_(engine_service), q_(q) {} + ~LocalEngine(); + + void Load(EngineLoadOption opts) final {} + + void Unload(EngineUnloadOption opts) final {} + + void HandleChatCompletion(std::shared_ptr json_body, + http_callback&& callback) final; + void HandleEmbedding(std::shared_ptr json_body, + http_callback&& callback) final; + void LoadModel(std::shared_ptr json_body, + http_callback&& callback) final; + void UnloadModel(std::shared_ptr json_body, + http_callback&& callback) final; + void GetModelStatus(std::shared_ptr json_body, + http_callback&& callback) final; + + // Get list of running models + void GetModels(std::shared_ptr jsonBody, + http_callback&& callback) final; + + bool SetFileLogger(int max_log_lines, const std::string& log_path) final { + return true; + } + void SetLogLevel(trantor::Logger::LogLevel logLevel) final {} + + // Stop inflight chat completion in stream mode + void StopInferencing(const std::string& model_id) final {} + + private: + void HandleOpenAiChatCompletion(std::shared_ptr json_body, + http_callback&& callback, + const std::string& model); + + void HandleNonOpenAiChatCompletion(std::shared_ptr json_body, + http_callback&& callback, + const std::string& model); + + private: + std::unordered_map server_map_; + EngineService& engine_service_; + TaskQueue& q_; +}; + +} // namespace cortex::local diff --git a/engine/main.cc b/engine/main.cc index ab4e74857..abde0441b 100644 --- a/engine/main.cc +++ b/engine/main.cc @@ -196,15 +196,16 @@ void RunServer(bool ignore_cout) { auto config_service = std::make_shared(); auto download_service = std::make_shared(event_queue_ptr, config_service); + auto task_queue = std::make_shared( + std::min(2u, std::thread::hardware_concurrency()), "background_task"); auto engine_service = std::make_shared( - download_service, dylib_path_manager, db_service); + download_service, dylib_path_manager, db_service, task_queue); auto inference_svc = std::make_shared(engine_service); auto model_src_svc = std::make_shared(db_service); - cortex::TaskQueue task_queue( - std::min(2u, std::thread::hardware_concurrency()), "background_task"); - auto model_service = - std::make_shared(db_service, hw_service, download_service, - inference_svc, engine_service, task_queue); + + auto model_service = std::make_shared( + db_service, hw_service, download_service, inference_svc, engine_service, + *task_queue); inference_svc->SetModelService(model_service); auto file_watcher_srv = std::make_shared( diff --git a/engine/repositories/file_fs_repository.cc b/engine/repositories/file_fs_repository.cc index f5b349f45..67c0981ba 100644 --- a/engine/repositories/file_fs_repository.cc +++ b/engine/repositories/file_fs_repository.cc @@ -18,14 +18,10 @@ std::filesystem::path SanitizePath(const std::filesystem::path& user_input, std::filesystem::path 
resolved_path = std::filesystem::weakly_canonical( std::filesystem::path(basedir) / std::filesystem::path(user_input)); /* Ensure the resolved path is within our basedir */ - for (auto p = resolved_path; !p.empty(); p = p.parent_path()) { - if (std::filesystem::equivalent(p, abs_base)) { - return resolved_path; - } - if (p == p.parent_path()) { // reached the root directory - break; - } + if (resolved_path.string().find(abs_base.string()) != std::string::npos) { + return resolved_path; } + return {}; } diff --git a/engine/services/engine_service.cc b/engine/services/engine_service.cc index 48cc6ff37..89cd00058 100644 --- a/engine/services/engine_service.cc +++ b/engine/services/engine_service.cc @@ -9,6 +9,7 @@ #include "config/model_config.h" #include "database/engines.h" #include "database/models.h" +#include "extensions/local-engine/local_engine.h" #include "extensions/remote-engine/remote_engine.h" #include "utils/archive_utils.h" @@ -16,6 +17,7 @@ #include "utils/engine_matcher_utils.h" #include "utils/file_manager_utils.h" #include "utils/github_release_utils.h" +#include "utils/hardware/os_info.h" #include "utils/logging_utils.h" #include "utils/normalize_engine.h" #include "utils/result.hpp" @@ -46,13 +48,6 @@ std::string Repo2Engine(const std::string& r) { } return r; }; - -std::string GetEnginePath(std::string_view e) { - if (e == kLlamaRepo) { - return kLlamaLibPath; - } - return kLlamaLibPath; -}; } // namespace cpp::result EngineService::InstallEngineAsync( @@ -236,11 +231,14 @@ cpp::result EngineService::DownloadEngine( auto latest_version_semantic = normalized_version == "latest" ? res.value()[0].version : normalized_version; - auto merged_variant_name = engine + "-" + latest_version_semantic + "-" + - variant_name.value() + ".tar.gz"; + std::unordered_set merged_variant_name = { + "llama-" + latest_version_semantic + "-bin-" + variant_name.value() + + ".tar.gz", // menlo + "llama-" + latest_version_semantic + "-bin-" + variant_name.value() + + ".zip"}; // ggml for (const auto& asset : res.value()) { - if (asset.name == merged_variant_name) { + if (merged_variant_name.find(asset.name) != merged_variant_name.end()) { selected_variant = asset; break; } @@ -275,43 +273,96 @@ cpp::result EngineService::DownloadEngine( } } - auto normalize_version = "v" + selected_variant->version; auto variant_folder_name = engine_matcher_utils::GetVariantFromNameAndVersion( selected_variant->name, engine, selected_variant->version); auto variant_folder_path = file_manager_utils::GetEnginesContainerPath() / engine / variant_folder_name.value() / - normalize_version; + selected_variant->version; auto variant_path = variant_folder_path / selected_variant->name; std::filesystem::create_directories(variant_folder_path); CTL_INF("variant_folder_path: " + variant_folder_path.string()); - auto on_finished = [this, engine, selected_variant, variant_folder_path, - normalize_version](const DownloadTask& finishedTask) { + auto on_finished = [this, engine, selected_variant, + variant_folder_path](const DownloadTask& finishedTask) { // try to unzip the downloaded file CTL_INF("Engine zip path: " << finishedTask.items[0].localPath.string()); - CTL_INF("Version: " + normalize_version); + CTL_INF("Version: " + selected_variant->version); auto extract_path = finishedTask.items[0].localPath.parent_path(); archive_utils::ExtractArchive(finishedTask.items[0].localPath.string(), extract_path.string(), true); - + CTL_INF("local path: " << finishedTask.items[0].localPath.string() + << ", extract path: " << 
extract_path.string()); auto variant = engine_matcher_utils::GetVariantFromNameAndVersion( - selected_variant->name, engine, normalize_version); - + selected_variant->name, engine, selected_variant->version); CTL_INF("Extracted variant: " + variant.value()); - // set as default + try { + // Create version file + std::ofstream meta(extract_path / "version.txt", std::ios::out); + meta << "name: " << variant.value() << std::endl; + meta << "version: " << selected_variant->version << std::endl; + meta.close(); + + std::filesystem::path bin_path = extract_path / "build" / "bin"; + if (std::filesystem::exists(bin_path)) { + for (const auto& entry : + std::filesystem::directory_iterator(bin_path)) { + if (entry.is_regular_file()) { + std::filesystem::path target_file = + extract_path / entry.path().filename(); + std::filesystem::copy_file( + entry.path(), target_file, + std::filesystem::copy_options::overwrite_existing); + } + } + std::filesystem::remove_all(bin_path.parent_path()); + } + if (!std::filesystem::exists(extract_path.parent_path().parent_path() / + "deps")) { + std::filesystem::create_directory( + extract_path.parent_path().parent_path() / "deps"); + } + std::filesystem::permissions(extract_path / kLlamaServer, + std::filesystem::perms::owner_exec | + std::filesystem::perms::group_exec | + std::filesystem::perms::others_exec, + std::filesystem::perm_options::add); + + const std::vector windows_deps = { + "msvcp140.dll", "vcruntime140.dll", "vcruntime140_1.dll"}; + for (auto const& win_dep : windows_deps) { + if (std::filesystem::exists( + file_manager_utils::GetExecutableFolderContainerPath() / + win_dep)) { + CTL_INF("Copy file " + << (file_manager_utils::GetExecutableFolderContainerPath() / + win_dep) + .string() + << " to " << extract_path.string()); + std::filesystem::copy_file( + file_manager_utils::GetExecutableFolderContainerPath() / win_dep, + extract_path / win_dep, + std::filesystem::copy_options::overwrite_existing); + } + } + + } catch (const std::exception& e) { + CTL_INF(e.what()); + } - auto res = - SetDefaultEngineVariant(engine, normalize_version, variant.value()); + // set as default + auto res = SetDefaultEngineVariant(engine, selected_variant->version, + variant.value()); if (res.has_error()) { CTL_ERR("Failed to set default engine variant: " << res.error()); } else { CTL_INF("Set default engine variant: " << res.value().variant); } - auto create_res = EngineService::UpsertEngine( - engine, // engine_name - kLocal, "", "", normalize_version, variant.value(), "Default", ""); + auto create_res = + EngineService::UpsertEngine(engine, // engine_name + kLocal, "", "", selected_variant->version, + variant.value(), "Default", ""); if (create_res.has_error()) { CTL_ERR("Failed to create engine entry: " << create_res->engine_name); @@ -322,7 +373,7 @@ cpp::result EngineService::DownloadEngine( for (const auto& entry : std::filesystem::directory_iterator( variant_folder_path.parent_path())) { if (entry.is_directory() && - entry.path().filename() != normalize_version) { + entry.path().filename() != selected_variant->version) { try { std::filesystem::remove_all(entry.path()); } catch (const std::exception& e) { @@ -450,7 +501,26 @@ std::string EngineService::GetMatchedVariant( cpp::result, std::string> EngineService::GetEngineReleases(const std::string& engine) const { auto ne = cortex::engine::NormalizeEngine(engine); - return github_release_utils::GetReleases("menloresearch", ne); + auto ggml_org = github_release_utils::GetReleases(kGgmlOrg, ne); + auto menlo = 
github_release_utils::GetReleases(kMenloOrg, ne); + if (ggml_org.has_error() && menlo.has_error()) { + return cpp::fail(ggml_org.error()); + } + auto comparator = [](const EngineService::EngineRelease& e1, + const EngineService::EngineRelease& e2) { + return e1.name > e2.name; + }; + std::set s(comparator); + if (ggml_org.has_value()) { + s.insert(ggml_org.value().begin(), ggml_org.value().end()); + } + + if (menlo.has_value()) { + s.insert(menlo.value().begin(), menlo.value().end()); + } + std::vector res; + std::copy(s.begin(), s.end(), std::back_inserter(res)); + return res; } cpp::result, std::string> @@ -458,16 +528,85 @@ EngineService::GetEngineVariants(const std::string& engine, const std::string& version, bool filter_compatible_only) const { auto ne = cortex::engine::NormalizeEngine(engine); - auto engine_release = - github_release_utils::GetReleaseByVersion("menloresearch", ne, version); + auto engine_release_menlo = + github_release_utils::GetReleaseByVersion(kMenloOrg, ne, version); + auto engine_release_ggml = + github_release_utils::GetReleaseByVersion(kGgmlOrg, ne, version); + + if (engine_release_menlo.has_error() && engine_release_ggml.has_error()) { + return cpp::fail("Failed to get engine release: " + + engine_release_menlo.error()); + } + if (engine_release_menlo.has_error()) { + CTL_WRN("Failed to get engine release: " << engine_release_menlo.error()); + } - if (engine_release.has_error()) { - return cpp::fail("Failed to get engine release: " + engine_release.error()); + if (engine_release_ggml.has_error()) { + CTL_WRN("Failed to get engine release: " << engine_release_ggml.error()); } std::vector compatible_variants; - for (const auto& variant : engine_release.value().assets) { - if (variant.content_type != "application/gzip") { + std::vector assets; + + auto get_os_major = []() -> int { + auto os_info = cortex::hw::GetOSInfo(); + // Get os major version + size_t dot_pos = os_info.version.find_first_of("."); + if (dot_pos != std::string::npos) { + try { + return std::stoi(os_info.version.substr(0, dot_pos)); + } catch (const std::exception& e) { + return 0; + } + } else { + // No version found + return 0; + } + }; + + if (engine_release_menlo.has_value()) { + // In case of macos, if os version is 12, we get binary from menlo + std::copy_if( + engine_release_menlo.value().assets.begin(), + engine_release_menlo.value().assets.end(), std::back_inserter(assets), + [get_os_major](const github_release_utils::GitHubAsset& assets) { +#if defined(__APPLE__) && defined(__MACH__) + if ((assets.name.find(kMacOs) == std::string::npos) || + (get_os_major() <= 12 && + assets.name.find(kMacOs) != std::string::npos)) { + return true; + } + return false; +#else + return true; +#endif + }); + } + + if (engine_release_ggml.has_value()) { + // In case of macos, if os version is 12, we get binary from menlo + std::copy_if( + engine_release_ggml.value().assets.begin(), + engine_release_ggml.value().assets.end(), std::back_inserter(assets), + [get_os_major](const github_release_utils::GitHubAsset& assets) { +#if defined(__APPLE__) && defined(__MACH__) + if ((assets.name.find(kMacOs) == std::string::npos) || + (get_os_major() > 12 && + assets.name.find(kMacOs) != std::string::npos)) { + return true; + } + return false; +#else + return true; +#endif + }); + } + + for (const auto& variant : assets) { + CTL_INF("content_type: " << variant.content_type + << ", name: " << variant.name); + if (variant.content_type != "application/gzip" && + variant.content_type != "application/json; charset=utf-8") 
{ continue; } if (variant.state != "uploaded") { @@ -494,30 +633,29 @@ EngineService::GetEngineVariants(const std::string& engine, name.find("mac") != std::string::npos) os_match = true; if (system_info->os == "windows" && - name.find("windows") != std::string::npos) + name.find("win") != std::string::npos) os_match = true; if (system_info->os == "linux" && - name.find("linux") != std::string::npos) + (name.find("linux") != std::string::npos || + name.find("ubuntu") != std::string::npos)) os_match = true; bool arch_match = false; if (system_info->arch == "arm64" && name.find("arm64") != std::string::npos) arch_match = true; - if (system_info->arch == "amd64" && - name.find("amd64") != std::string::npos) + if (system_info->arch == "x64" && + name.find("x64") != std::string::npos) arch_match = true; return !(os_match && arch_match); }), compatible_variants.end()); - if (compatible_variants.empty()) { return cpp::fail("No compatible variants found for system " + system_info->os + "/" + system_info->arch); } } - return compatible_variants; } @@ -550,7 +688,7 @@ EngineService::SetDefaultEngineVariant(const std::string& engine, auto normalized_version = string_utils::RemoveSubstring(version, "v"); auto config = file_manager_utils::GetCortexConfig(); - config.llamacppVersion = "v" + normalized_version; + config.llamacppVersion = normalized_version; config.llamacppVariant = variant; auto result = file_manager_utils::UpdateCortexConfig(config); if (result.has_error()) { @@ -574,10 +712,10 @@ cpp::result EngineService::IsEngineVariantReady( return cpp::fail(installed_engines.error()); } - CLI_LOG("IsEngineVariantReady: " << ne << ", " << normalized_version << ", " + CTL_INF("IsEngineVariantReady: " << ne << ", " << normalized_version << ", " << variant); for (const auto& installed_engine : installed_engines.value()) { - CLI_LOG("Installed: name: " + installed_engine.name + + CTL_INF("Installed: name: " + installed_engine.name + ", version: " + installed_engine.version); if ((installed_engine.name == variant && installed_engine.version == normalized_version) || @@ -640,10 +778,10 @@ EngineService::GetInstalledEngineVariants(const std::string& engine) const { try { auto node = YAML::LoadFile(version_txt_path.string()); auto ev = EngineVariantResponse{ - node["name"].as(), // name - "v" + node["version"].as(), // version - engine, // engine - "", // type + node["name"].as(), // name + node["version"].as(), // version + engine, // engine + "", // type }; variants.push_back(ev); } catch (const YAML::Exception& e) { @@ -696,76 +834,18 @@ cpp::result EngineService::LoadEngine( } return {}; } - - // End hard code - - CTL_INF("Loading engine: " << ne); + if (engines_.find(ne) == engines_.end()) { + CTL_INF("Loading local engine: " << engine_name); #if defined(_WIN32) || defined(_WIN64) || defined(__linux__) - CTL_INF("CPU Info: " << cortex::cpuid::CpuInfo().to_string()); + CTL_INF("CPU Info: " << cortex::cpuid::CpuInfo().to_string()); #endif - - auto engine_dir_path_res = GetEngineDirPath(ne); - if (engine_dir_path_res.has_error()) { - return cpp::fail(engine_dir_path_res.error()); + engines_[ne].engine = new cortex::local::LocalEngine(*this, *(q_.get())); + CTL_INF("Loaded engine: " << engine_name); + } else { + CTL_INF("Engine has already been loaded: " << engine_name); } - auto engine_dir_path = engine_dir_path_res.value().first; - auto custom_engine_path = engine_dir_path_res.value().second; - - try { - auto cuda_path = file_manager_utils::GetCudaToolkitPath(ne); - -#if defined(_WIN32) || defined(_WIN64) 
- // register deps - if (!(getenv("ENGINE_PATH"))) { - std::vector paths{}; - paths.push_back(cuda_path); - paths.push_back(engine_dir_path); - - CTL_DBG("Registering dylib for " - << ne << " with " << std::to_string(paths.size()) << " paths."); - for (const auto& path : paths) { - CTL_DBG("Registering path: " << path.string()); - } - - auto reg_result = dylib_path_manager_->RegisterPath(ne, paths); - if (reg_result.has_error()) { - CTL_DBG("Failed register lib paths for: " << ne); - } else { - CTL_DBG("Registered lib paths for: " << ne); - } - } -#endif - auto dylib = - std::make_unique(engine_dir_path.string(), "engine"); - - auto config = file_manager_utils::GetCortexConfig(); - auto log_path = std::filesystem::path(config.logFolderPath) / - std::filesystem::path(config.logLlamaCppPath); - - // init - auto func = dylib->get_function("get_engine"); - auto engine_obj = func(); - auto load_opts = EngineI::EngineLoadOption{ - /* .engine_path = */ engine_dir_path, - /* .deps_path = */ cuda_path, - /* .is_custom_engine_path = */ custom_engine_path, - /* .log_path = */ log_path, - /* .max_log_lines = */ config.maxLogLines, - /* .log_level = */ logging_utils_helper::global_log_level, - }; - engine_obj->Load(load_opts); - - engines_[ne].engine = engine_obj; - engines_[ne].dl = std::move(dylib); - - CTL_DBG("Engine loaded: " << ne); - return {}; - } catch (const cortex_cpp::dylib::load_error& e) { - CTL_ERR("Could not load engine: " << e.what()); - engines_.erase(ne); - return cpp::fail("Could not load engine " + ne + ": " + e.what()); - } + return {}; } void EngineService::RegisterEngineLibPath() { @@ -796,7 +876,8 @@ void EngineService::RegisterEngineLibPath() { auto reg_result = dylib_path_manager_->RegisterPath(ne, paths); if (reg_result.has_error()) { - CTL_WRN("Failed register lib path for " << engine); + CTL_WRN("Failed register lib path for " + << engine << ", error: " << reg_result.error()); } else { CTL_DBG("Registered lib path for " << engine); } @@ -829,8 +910,8 @@ EngineService::GetEngineDirPath(const std::string& engine_name) { CTL_DBG("user defined engine path: " << user_defined_engine_path); const std::filesystem::path engine_dir_path = [&] { if (user_defined_engine_path != nullptr) { - return std::filesystem::path(user_defined_engine_path) / - GetEnginePath(ne) / selected_engine_variant->variant / + return std::filesystem::path(user_defined_engine_path) / kLlamaLibPath / + selected_engine_variant->variant / selected_engine_variant->version; } else { return file_manager_utils::GetEnginesContainerPath() / ne / @@ -891,8 +972,7 @@ std::vector EngineService::GetLoadedEngines() { cpp::result EngineService::GetLatestEngineVersion(const std::string& engine) const { auto ne = cortex::engine::NormalizeEngine(engine); - auto res = - github_release_utils::GetReleaseByVersion("menloresearch", ne, "latest"); + auto res = github_release_utils::GetReleaseByVersion(kMenloOrg, ne, "latest"); if (res.has_error()) { return cpp::fail("Failed to fetch engine " + engine + " latest version!"); } diff --git a/engine/services/engine_service.h b/engine/services/engine_service.h index 7e6be74c5..0be1fff64 100644 --- a/engine/services/engine_service.h +++ b/engine/services/engine_service.h @@ -19,6 +19,7 @@ #include "utils/github_release_utils.h" #include "utils/result.hpp" #include "utils/system_info_utils.h" +#include "utils/task_queue.h" struct EngineUpdateResult { std::string engine; @@ -44,7 +45,6 @@ class EngineService : public EngineServiceI { using EngineVariant = github_release_utils::GitHubAsset; 
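  // [Editor's note] Illustrative wiring sketch, not part of the patch; it
  // mirrors the main.cc change earlier in this diff and assumes the
  // cortex::TaskQueue type from utils/task_queue.h:
  //
  //   auto task_queue = std::make_shared<cortex::TaskQueue>(
  //       std::min(2u, std::thread::hardware_concurrency()),
  //       "background_task");
  //   auto engine_service = std::make_shared<EngineService>(
  //       download_service, dylib_path_manager, db_service, task_queue);
  //
  //   // LoadEngine() then constructs the in-process engine directly:
  //   //   engines_[ne].engine = new cortex::local::LocalEngine(*this, *task_queue);
  //   // instead of dlopen-ing a cortex.llamacpp dynamic library as before,
  //   // which is why EngineInfo below no longer holds a dylib handle.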
struct EngineInfo { - std::unique_ptr dl; EngineV engine; }; @@ -60,12 +60,13 @@ class EngineService : public EngineServiceI { }; HardwareInfo hw_inf_; std::shared_ptr db_service_ = nullptr; + std::shared_ptr q_ = nullptr; public: - explicit EngineService( - std::shared_ptr download_service, - std::shared_ptr dylib_path_manager, - std::shared_ptr db_service) + EngineService(std::shared_ptr download_service, + std::shared_ptr dylib_path_manager, + std::shared_ptr db_service, + std::shared_ptr q) : download_service_{download_service}, dylib_path_manager_{dylib_path_manager}, hw_inf_{ @@ -74,9 +75,17 @@ class EngineService : public EngineServiceI { system_info_utils::GetDriverAndCudaVersion() .second // cuda_driver_version. }, + db_service_(db_service), + q_(q) {} - db_service_(db_service) {} - + EngineService(std::shared_ptr dylib_path_manager) + : dylib_path_manager_(dylib_path_manager), + hw_inf_{ + system_info_utils::GetSystemInfo(), // sys_inf. + {}, // cpu_info. + system_info_utils::GetDriverAndCudaVersion() + .second // cuda_driver_version. + } {} std::vector GetEngineInfoList() const; /** @@ -159,6 +168,9 @@ class EngineService : public EngineServiceI { bool IsRemoteEngine(const std::string& engine_name) const override; + cpp::result, std::string> + GetEngineDirPath(const std::string& engine_name); + private: bool IsEngineLoaded(const std::string& engine); @@ -172,9 +184,6 @@ class EngineService : public EngineServiceI { std::string GetMatchedVariant(const std::string& engine, const std::vector& variants); - cpp::result, std::string> - GetEngineDirPath(const std::string& engine_name); - cpp::result IsEngineVariantReady( const std::string& engine, const std::string& version, const std::string& variant); diff --git a/engine/services/hardware_service.cc b/engine/services/hardware_service.cc index f0ccadb28..fb2f841be 100644 --- a/engine/services/hardware_service.cc +++ b/engine/services/hardware_service.cc @@ -203,10 +203,8 @@ bool HardwareService::Restart(const std::string& host, int port) { #else std::vector commands; // Some engines requires to add lib search path before process being created - auto download_srv = std::make_shared(); - auto dylib_path_mng = std::make_shared(); - auto db_srv = std::make_shared(); - EngineService(download_srv, dylib_path_mng, db_srv).RegisterEngineLibPath(); + EngineService(std::make_shared()) + .RegisterEngineLibPath(); std::string p = cortex_utils::GetCurrentPath() / exe; commands.push_back(p); commands.push_back("--ignore_cout"); diff --git a/engine/services/inference_service.cc b/engine/services/inference_service.cc index a1646495b..75d95f06d 100644 --- a/engine/services/inference_service.cc +++ b/engine/services/inference_service.cc @@ -12,7 +12,9 @@ cpp::result InferenceService::HandleChatCompletion( } else { engine_type = (*(json_body)).get("engine", kLlamaRepo).asString(); } + CTL_DBG("engine_type: " << engine_type); function_calling_utils::PreprocessRequest(json_body); + CTL_DBG("engine_type: " << engine_type); auto tool_choice = json_body->get("tool_choice", Json::Value::null); auto model_id = json_body->get("model", "").asString(); if (saved_models_.find(model_id) != saved_models_.end()) { @@ -32,6 +34,7 @@ cpp::result InferenceService::HandleChatCompletion( } } } + CTL_DBG("engine_type: " << engine_type); auto engine_result = engine_service_->GetLoadedEngine(engine_type); if (engine_result.has_error()) { @@ -275,9 +278,7 @@ InferResult InferenceService::GetModels( for (const auto& loaded_engine : loaded_engines) { if 
(std::holds_alternative(loaded_engine)) { auto e = std::get(loaded_engine); - if (e->IsSupported("GetModels")) { - e->GetModels(json_body, std::move(cb)); - } + e->GetModels(json_body, std::move(cb)); } else { std::get(loaded_engine) ->GetModels(json_body, std::move(cb)); @@ -302,10 +303,8 @@ bool InferenceService::StopInferencing(const std::string& engine_name, if (std::holds_alternative(engine_result.value())) { auto engine = std::get(engine_result.value()); - if (engine->IsSupported("StopInferencing")) { - engine->StopInferencing(model_id); - CTL_INF("Stopped inferencing"); - } + engine->StopInferencing(model_id); + CTL_INF("Stopped inferencing"); } return true; } diff --git a/engine/test/components/test_engine_matcher_utils.cc b/engine/test/components/test_engine_matcher_utils.cc index 1d1ed47a8..2c24a9b6f 100644 --- a/engine/test/components/test_engine_matcher_utils.cc +++ b/engine/test/components/test_engine_matcher_utils.cc @@ -6,125 +6,78 @@ class EngineMatcherUtilsTestSuite : public ::testing::Test { protected: const std::vector cortex_llamacpp_variants{ - "cortex.llamacpp-0.1.25-25.08.24-linux-amd64-avx-cuda-11-7.tar.gz", - "cortex.llamacpp-0.1.25-25.08.24-linux-amd64-avx-cuda-12-0.tar.gz", - "cortex.llamacpp-0.1.25-25.08.24-linux-amd64-avx.tar.gz", - "cortex.llamacpp-0.1.25-25.08.24-linux-amd64-avx2-cuda-11-7.tar.gz", - "cortex.llamacpp-0.1.25-25.08.24-linux-amd64-avx2-cuda-12-0.tar.gz", - "cortex.llamacpp-0.1.25-25.08.24-linux-amd64-avx2.tar.gz", - "cortex.llamacpp-0.1.25-25.08.24-linux-amd64-avx512-cuda-11-7.tar.gz", - "cortex.llamacpp-0.1.25-25.08.24-linux-amd64-avx512-cuda-12-0.tar.gz", - "cortex.llamacpp-0.1.25-25.08.24-linux-amd64-avx512.tar.gz", - "cortex.llamacpp-0.1.25-25.08.24-linux-amd64-noavx-cuda-11-7.tar.gz", - "cortex.llamacpp-0.1.25-25.08.24-linux-amd64-noavx-cuda-12-0.tar.gz", - "cortex.llamacpp-0.1.25-25.08.24-linux-amd64-noavx.tar.gz", - "cortex.llamacpp-0.1.25-25.08.24-linux-amd64-vulkan.tar.gz", - "cortex.llamacpp-0.1.43-linux-arm64.tar.gz", - "cortex.llamacpp-0.1.25-25.08.24-mac-amd64.tar.gz", - "cortex.llamacpp-0.1.25-25.08.24-mac-arm64.tar.gz", - "cortex.llamacpp-0.1.25-25.08.24-windows-amd64-avx-cuda-11-7.tar.gz", - "cortex.llamacpp-0.1.25-25.08.24-windows-amd64-avx-cuda-12-0.tar.gz", - "cortex.llamacpp-0.1.25-25.08.24-windows-amd64-avx.tar.gz", - "cortex.llamacpp-0.1.25-25.08.24-windows-amd64-avx2-cuda-11-7.tar.gz", - "cortex.llamacpp-0.1.25-25.08.24-windows-amd64-avx2-cuda-12-0.tar.gz", - "cortex.llamacpp-0.1.25-25.08.24-windows-amd64-avx2.tar.gz", - "cortex.llamacpp-0.1.25-25.08.24-windows-amd64-avx512-cuda-11-7.tar.gz", - "cortex.llamacpp-0.1.25-25.08.24-windows-amd64-avx512-cuda-12-0.tar.gz", - "cortex.llamacpp-0.1.25-25.08.24-windows-amd64-avx512.tar.gz", - "cortex.llamacpp-0.1.25-25.08.24-windows-amd64-noavx-cuda-11-7.tar.gz", - "cortex.llamacpp-0.1.25-25.08.24-windows-amd64-noavx-cuda-12-0.tar.gz", - "cortex.llamacpp-0.1.25-25.08.24-windows-amd64-noavx.tar.gz", - "cortex.llamacpp-0.1.25-25.08.24-windows-amd64-vulkan.tar.gz", + "llama-b4920-bin-ubuntu-arm64.zip", + "llama-b4920-bin-linux-avx-cuda-cu11.7-x64.tar.gz", + "llama-b4920-bin-linux-avx-cuda-cu12.0-x64.tar.gz", + "llama-b4920-bin-linux-avx-x64.tar.gz", + "llama-b4920-bin-linux-avx2-cuda-cu11.7-x64.tar.gz", + "llama-b4920-bin-linux-avx2-cuda-cu12.0-x64.tar.gz", + "llama-b4920-bin-ubuntu-x64.tar.gz", + "llama-b4920-bin-linux-avx512-cuda-cu11.7-x64.tar.gz", + "llama-b4920-bin-linux-avx512-cuda-cu12.0-x64.tar.gz", + "llama-b4920-bin-linux-avx512-x64.tar.gz", + 
"llama-b4920-bin-linux-noavx-cuda-cu11.7-x64.tar.gz", + "llama-b4920-bin-linux-noavx-cuda-cu12.0-x64.tar.gz", + "llama-b4920-bin-linux-noavx-x64.tar.gz", + "llama-b4920-bin-ubuntu-vulkan-x64.tar.gz", + "llama-b4920-bin-macos-arm64.zip", + "llama-b4920-bin-macos-x64.zip", + "llama-b4920-bin-win-avx-cuda-cu11.7-x64.tar.gz", + "llama-b4920-bin-win-avx-cuda-cu12.0-x64.tar.gz", + "llama-b4920-bin-win-avx-x64.zip", + "llama-b4920-bin-win-avx2-cuda-cu11.7-x64.tar.gz", + "llama-b4920-bin-win-avx2-cuda-cu12.0-x64.tar.gz", + "llama-b4920-bin-win-avx2-x64.zip", + "llama-b4920-bin-win-avx512-cuda-cu11.7-x64.tar.gz", + "llama-b4920-bin-win-avx512-cuda-cu12.0-x64.tar.gz", + "llama-b4920-bin-win-avx512-x64.zip", + "llama-b4920-bin-win-noavx-cuda-cu11.7-x64.tar.gz", + "llama-b4920-bin-win-noavx-cuda-cu12.0-x64.tar.gz", + "llama-b4920-bin-win-noavx-x64.zip", + "llama-b4920-bin-win-vulkan-x64.zip", }; - - const std::vector cortex_tensorrt_variants{ - "cortex.tensorrt-llm-0.0.9-linux-cuda-12-4.tar.gz", - "cortex.tensorrt-llm-0.0.9-windows-cuda-12-4.tar.gz"}; - - const std::vector cortex_onnx_variants{ - "cortex.onnx-0.1.7-windows-amd64.tar.gz"}; }; -TEST_F(EngineMatcherUtilsTestSuite, TestValidateOnnx) { - - { - auto expect_matched_variant = cortex_onnx_variants[0]; - auto result = engine_matcher_utils::ValidateOnnx(cortex_onnx_variants, - "windows", "amd64"); - - EXPECT_EQ(result, expect_matched_variant); - } - - { - // should return an empty variant because no variant matched - auto expect_matched_variant{""}; - auto windows_arm_result = engine_matcher_utils::ValidateOnnx( - cortex_onnx_variants, "windows", "arm"); - auto mac_arm64_result = engine_matcher_utils::ValidateOnnx( - cortex_onnx_variants, "mac", "arm64"); - - EXPECT_EQ(windows_arm_result, expect_matched_variant); - EXPECT_EQ(mac_arm64_result, expect_matched_variant); - } -} - -TEST_F(EngineMatcherUtilsTestSuite, TestValidateTensorrt) { - +TEST_F(EngineMatcherUtilsTestSuite, TestValidate) { { - auto windows_expect_matched_variant{cortex_tensorrt_variants[1]}; - auto linux_expect_matched_variant{cortex_tensorrt_variants[0]}; - auto windows{"windows"}; - auto linux{"linux"}; + auto os{"win"}; + auto cpu_arch{"x64"}; + auto suitable_avx{"avx2"}; auto cuda_version{"12.4"}; - auto windows_result = engine_matcher_utils::ValidateTensorrtLlm( - cortex_tensorrt_variants, windows, cuda_version); - auto linux_result = engine_matcher_utils::ValidateTensorrtLlm( - cortex_tensorrt_variants, linux, cuda_version); - EXPECT_EQ(windows_result, windows_expect_matched_variant); - EXPECT_EQ(linux_result, linux_expect_matched_variant); - } - - { // macos is not supported - auto os = "mac"; - auto cuda_version{"12.4"}; + auto variant = engine_matcher_utils::Validate( + cortex_llamacpp_variants, os, cpu_arch, suitable_avx, cuda_version); - auto result = engine_matcher_utils::ValidateTensorrtLlm( - cortex_tensorrt_variants, os, cuda_version); - EXPECT_EQ(result, ""); + EXPECT_EQ(variant, "llama-b4920-bin-win-avx2-cuda-cu12.0-x64.tar.gz"); } -} -TEST_F(EngineMatcherUtilsTestSuite, TestValidate) { { - auto os{"windows"}; - auto cpu_arch{"amd64"}; - auto suitable_avx{"avx2"}; - auto cuda_version{"12.4"}; + auto os{"mac"}; + auto cpu_arch{"x64"}; + auto suitable_avx{""}; + auto cuda_version{""}; auto variant = engine_matcher_utils::Validate( cortex_llamacpp_variants, os, cpu_arch, suitable_avx, cuda_version); - EXPECT_EQ( - variant, - "cortex.llamacpp-0.1.25-25.08.24-windows-amd64-avx2-cuda-12-0.tar.gz"); + EXPECT_EQ(variant, "llama-b4920-bin-macos-x64.zip"); } { auto 
os{"mac"}; - auto cpu_arch{"amd64"}; + auto cpu_arch{"arm64"}; auto suitable_avx{""}; auto cuda_version{""}; auto variant = engine_matcher_utils::Validate( cortex_llamacpp_variants, os, cpu_arch, suitable_avx, cuda_version); - EXPECT_EQ(variant, "cortex.llamacpp-0.1.25-25.08.24-mac-amd64.tar.gz"); + EXPECT_EQ(variant, "llama-b4920-bin-macos-arm64.zip"); } { - auto os{"windows"}; - auto cpu_arch{"amd64"}; + auto os{"win"}; + auto cpu_arch{"x64"}; auto suitable_avx{"avx2"}; auto cuda_version{"10"}; @@ -132,8 +85,7 @@ TEST_F(EngineMatcherUtilsTestSuite, TestValidate) { cortex_llamacpp_variants, os, cpu_arch, suitable_avx, cuda_version); // fallback to no cuda version - EXPECT_EQ(variant, - "cortex.llamacpp-0.1.25-25.08.24-windows-amd64-avx2.tar.gz"); + EXPECT_EQ(variant, "llama-b4920-bin-win-avx2-x64.zip"); } { @@ -145,30 +97,43 @@ TEST_F(EngineMatcherUtilsTestSuite, TestValidate) { auto variant = engine_matcher_utils::Validate( cortex_llamacpp_variants, os, cpu_arch, suitable_avx, cuda_version); - EXPECT_EQ(variant, "cortex.llamacpp-0.1.43-linux-arm64.tar.gz"); + EXPECT_EQ(variant, "llama-b4920-bin-ubuntu-arm64.zip"); } } TEST_F(EngineMatcherUtilsTestSuite, TestGetVersionAndArch) { { - std::string variant = - "cortex.llamacpp-0.1.25-25.08.24-linux-amd64-avx-cuda-11-7.tar.gz"; + std::string variant = "llama-b4920-bin-linux-avx-cuda-cu11.7-x64.tar.gz"; + auto [version, arch] = engine_matcher_utils::GetVersionAndArch(variant); + EXPECT_EQ(version, "b4920"); + EXPECT_EQ(arch, "linux-avx-cuda-cu11.7-x64"); + } + + { + std::string variant = "llama-b4920-bin-ubuntu-arm64.zip"; + auto [version, arch] = engine_matcher_utils::GetVersionAndArch(variant); + EXPECT_EQ(version, "b4920"); + EXPECT_EQ(arch, "ubuntu-arm64"); + } + + { + std::string variant = "llama-b4920-bin-win-avx2-x64.zip"; auto [version, arch] = engine_matcher_utils::GetVersionAndArch(variant); - EXPECT_EQ(version, "v0.1.25-25.08.24"); - EXPECT_EQ(arch, "linux-amd64-avx-cuda-11-7"); + EXPECT_EQ(version, "b4920"); + EXPECT_EQ(arch, "win-avx2-x64"); } { - std::string variant = "cortex.llamacpp-0.1.25-windows-amd64-avx2.tar.gz"; + std::string variant = "llama-b4920-bin-macos-x64.tar.gz"; auto [version, arch] = engine_matcher_utils::GetVersionAndArch(variant); - EXPECT_EQ(version, "v0.1.25"); - EXPECT_EQ(arch, "windows-amd64-avx2"); + EXPECT_EQ(version, "b4920"); + EXPECT_EQ(arch, "macos-x64"); } { - std::string variant = "cortex.llamacpp-0.1.25-25.08.24-mac-amd64.tar.gz"; + std::string variant = "llama-b4920-bin-ubuntu-vulkan-x64.zip"; auto [version, arch] = engine_matcher_utils::GetVersionAndArch(variant); - EXPECT_EQ(version, "v0.1.25-25.08.24"); - EXPECT_EQ(arch, "mac-amd64"); + EXPECT_EQ(version, "b4920"); + EXPECT_EQ(arch, "ubuntu-vulkan-x64"); } } diff --git a/engine/test/components/test_github_release_utils.cc b/engine/test/components/test_github_release_utils.cc index ae1e2c7c2..20c14b187 100644 --- a/engine/test/components/test_github_release_utils.cc +++ b/engine/test/components/test_github_release_utils.cc @@ -4,16 +4,16 @@ class GitHubReleaseUtilsTest : public ::testing::Test {}; TEST_F(GitHubReleaseUtilsTest, AbleToGetReleaseByVersion) { - auto version{"v0.1.36"}; + auto version{"b4920"}; auto result = github_release_utils::GetReleaseByVersion( - "menloresearch", "cortex.llamacpp", version); + kMenloOrg, "llama.cpp", version); ASSERT_TRUE(result.has_value()); ASSERT_EQ(result->tag_name, version); } TEST_F(GitHubReleaseUtilsTest, AbleToGetReleaseList) { - auto result = github_release_utils::GetReleases("menloresearch", 
"cortex.llamacpp"); + auto result = github_release_utils::GetReleases(kMenloOrg, "llama.cpp"); ASSERT_TRUE(result.has_value()); ASSERT_TRUE(result->size() > 0); diff --git a/engine/test/components/test_string_utils.cc b/engine/test/components/test_string_utils.cc index 42211b668..e12046136 100644 --- a/engine/test/components/test_string_utils.cc +++ b/engine/test/components/test_string_utils.cc @@ -288,6 +288,47 @@ TEST_F(StringUtilsTestSuite, LargeInputPerformance) { EXPECT_EQ(RemoveSubstring(large_input, to_remove), ""); } +TEST(LTrimTest, EmptyString) { + std::string s = ""; + LTrim(s); + EXPECT_EQ(s, ""); +} + +TEST(LTrimTest, NoSpaces) { + std::string s = "HelloWorld"; + LTrim(s); + EXPECT_EQ(s, "HelloWorld"); +} + +TEST(LTrimTest, LeadingSpaces) { + std::string s = " HelloWorld"; + LTrim(s); + EXPECT_EQ(s, "HelloWorld"); +} + +TEST(LTrimTest, LeadingTabs) { + std::string s = "\t\tHelloWorld"; + LTrim(s); + EXPECT_EQ(s, "HelloWorld"); +} + +TEST(LTrimTest, LeadingNewlines) { + std::string s = "\n\nHelloWorld"; + LTrim(s); + EXPECT_EQ(s, "HelloWorld"); +} + +TEST(LTrimTest, OnlySpaces) { + std::string s = " "; + LTrim(s); + EXPECT_EQ(s, ""); +} + +TEST(LTrimTest, MixedSpaces) { + std::string s = " \t\nHelloWorld "; + LTrim(s); + EXPECT_EQ(s, "HelloWorld "); +} TEST_F(StringUtilsTestSuite, UrlPaths_SimilarStrings) { std::string str1 = "/v1/threads/{1}/messages/{2}"; diff --git a/engine/utils/cuda_toolkit_utils.h b/engine/utils/cuda_toolkit_utils.h index 748af1bd3..e7aadfdd6 100644 --- a/engine/utils/cuda_toolkit_utils.h +++ b/engine/utils/cuda_toolkit_utils.h @@ -7,32 +7,7 @@ inline std::string GetCompatibleCudaToolkitVersion( const std::string& driver_semantic_version, const std::string& os, const std::string& engine) { - if (engine == "cortex.tensorrt-llm") { - // if the engine is cortex.tensorrt-llm, the minimum required CUDA version is 12.4 - if (os == "windows") { - if (semantic_version_utils::CompareSemanticVersion( - driver_semantic_version, "527.41") >= 0) { - return "12.4"; - } else { - throw std::runtime_error( - "GPU driver version not supported. Minimum " - "required driver version is 527.41"); - } - } else if (os == "linux") { - if (semantic_version_utils::CompareSemanticVersion( - driver_semantic_version, "525.60.13") >= 0) { - return "12.4"; - } else { - throw std::runtime_error( - "GPU driver version not supported. Minimum required driver version " - "is 525.60.13"); - } - } else { - throw std::runtime_error("Unsupported OS"); - } - } - - if (os == "windows") { + if (os == "windows" || os == "win") { if (semantic_version_utils::CompareSemanticVersion(driver_semantic_version, "527.41") >= 0) { return "12.4"; @@ -44,7 +19,7 @@ inline std::string GetCompatibleCudaToolkitVersion( "GPU driver version not supported. 
Minimum " "required driver version is 452.39"); } - } else if (os == "linux") { + } else if (os == "linux" || os == "ubuntu") { if (semantic_version_utils::CompareSemanticVersion(driver_semantic_version, "525.60.13") >= 0) { return "12.4"; diff --git a/engine/utils/dylib_path_manager.cc b/engine/utils/dylib_path_manager.cc index 7c389df06..878620185 100644 --- a/engine/utils/dylib_path_manager.cc +++ b/engine/utils/dylib_path_manager.cc @@ -26,7 +26,7 @@ cpp::result DylibPathManager::RegisterPath( } return cpp::fail("Failed to add DLL directory: " + path.string()); } else { - CTL_DBG("Added DLL directory: " << path.string()); + CTL_INF("Added DLL directory: " << path.string()); } dylib_paths.push_back({path, cookie}); diff --git a/engine/utils/engine_constants.h b/engine/utils/engine_constants.h index 2c5cd1be3..695afb4c5 100644 --- a/engine/utils/engine_constants.h +++ b/engine/utils/engine_constants.h @@ -5,20 +5,23 @@ constexpr const auto kLlamaEngine = "llama-cpp"; constexpr const auto kRemote = "remote"; constexpr const auto kLocal = "local"; +constexpr const auto kLlamaRepo = "llama.cpp"; +constexpr const auto kLlamaLibPath = "./engines/llama.cpp"; +constexpr const auto kLlamaServer = "llama-server"; -constexpr const auto kLlamaRepo = "cortex.llamacpp"; - -constexpr const auto kLlamaLibPath = "./engines/cortex.llamacpp"; +constexpr const auto kMenloOrg = "menloresearch"; +constexpr const auto kGgmlOrg = "ggml-org"; // other constants constexpr auto static kHuggingFaceHost = "huggingface.co"; constexpr auto static kGitHubHost = "api.github.com"; constexpr auto static kCortexFolderName = "cortexcpp"; -constexpr auto static kDefaultGHUserAgent = "menloresearch"; +constexpr auto static kDefaultGHUserAgent = kMenloOrg; -constexpr auto static kWindowsOs = "windows"; +constexpr auto static kWindowsOs = "win"; constexpr auto static kMacOs = "mac"; constexpr auto static kLinuxOs = "linux"; +constexpr auto static kUbuntuOs = "ubuntu"; constexpr auto static kUnsupportedOs = "Unsupported OS"; constexpr auto static kCurlGetTimeout = 10; diff --git a/engine/utils/engine_matcher_utils.h b/engine/utils/engine_matcher_utils.h index 0b0cb26be..1afdd194c 100644 --- a/engine/utils/engine_matcher_utils.h +++ b/engine/utils/engine_matcher_utils.h @@ -7,6 +7,7 @@ #include #include #include "utils/cpuid/cpu_info.h" +#include "utils/engine_constants.h" #include "utils/logging_utils.h" #include "utils/result.hpp" #include "utils/string_utils.h" @@ -24,13 +25,19 @@ inline cpp::result GetVariantFromNameAndVersion( if (engine.empty()) { return cpp::fail("Engine name is empty"); } - auto nv = string_utils::RemoveSubstring(version, "v"); - using namespace string_utils; - auto removed_extension = RemoveSubstring(engine_file_name, ".tar.gz"); - auto version_and_variant = RemoveSubstring(removed_extension, engine + "-"); - - auto variant = RemoveSubstring(version_and_variant, nv + "-"); - return variant; + CTL_DBG("version: " << version); + namespace su = string_utils; + CTL_DBG("engine_file_name: " << engine_file_name); + auto rm_extension_menlo = su::RemoveSubstring(engine_file_name, ".tar.gz"); + auto rm_extension_ggml = su::RemoveSubstring(rm_extension_menlo, ".zip"); + CTL_DBG("removed_extension: " << rm_extension_ggml); + auto version_and_variant = + su::RemoveSubstring(rm_extension_ggml, engine + "-"); + CTL_DBG("version_and_variant: " << version_and_variant); + auto variant = su::RemoveSubstring(version_and_variant, version + "-"); + auto v = su::RemoveSubstring(variant, "llama-bin-"); + CTL_DBG("variant: " 
<< v); + return v; } inline std::string GetSuitableAvxVariant(cortex::cpuid::CpuInfo& cpu_info) { @@ -48,7 +55,7 @@ inline std::string GetSuitableCudaVariant( const std::vector<std::string>& variants, const std::string& cuda_version) { - std::regex cuda_reg("cuda-(\\d+)-(\\d+)"); + std::regex cuda_reg("cuda-cu(\\d+)\\.(\\d+)"); std::smatch match; int requested_major = 0; @@ -141,8 +148,9 @@ inline std::string Validate(const std::vector<std::string>& variants, const std::string& os, const std::string& cpu_arch, const std::string& suitable_avx, const std::string& cuda_version) { + // CTL_INF(os << " " << cpu_arch); // Early return if the OS is not supported - if (os != "mac" && os != "windows" && os != "linux") { + if (os != kMacOs && os != kWindowsOs && os != kLinuxOs) { return ""; } @@ -150,6 +158,12 @@ inline std::string Validate(const std::vector<std::string>& variants, std::copy_if(variants.begin(), variants.end(), std::back_inserter(os_and_arch_compatible_list), [&os, &cpu_arch](const std::string& variant) { + // In case of Linux, we need to include ubuntu version also + if (os == kLinuxOs) { + if (variant.find(kUbuntuOs) != std::string::npos && + variant.find(cpu_arch) != std::string::npos) + return true; + } auto os_match = "-" + os; auto cpu_arch_match = "-" + cpu_arch; @@ -157,10 +171,10 @@ inline std::string Validate(const std::vector<std::string>& variants, variant.find(cpu_arch_match) != std::string::npos; }); - if (os == "mac" && !os_and_arch_compatible_list.empty()) + if (os == kMacOs && !os_and_arch_compatible_list.empty()) return os_and_arch_compatible_list[0]; - if (os == "linux" && cpu_arch == "arm64" && + if (os == kLinuxOs && cpu_arch == "arm64" && !os_and_arch_compatible_list.empty()) { return os_and_arch_compatible_list[0]; } @@ -170,7 +184,14 @@ inline std::string Validate(const std::vector<std::string>& variants, std::copy_if(os_and_arch_compatible_list.begin(), os_and_arch_compatible_list.end(), std::back_inserter(avx_compatible_list), - [&suitable_avx](const std::string& variant) { + [&os, &cpu_arch, &suitable_avx](const std::string& variant) { + if (os == kLinuxOs && + (suitable_avx == "avx2" || suitable_avx == "avx512" || + cpu_arch == "arm64")) { + if (variant.find(std::string(kUbuntuOs) + "-" + cpu_arch) != + std::string::npos) + return true; + } auto suitable_avx_match = "-" + suitable_avx; return variant.find(suitable_avx_match) != std::string::npos; @@ -185,15 +206,18 @@ inline std::string Validate(const std::vector<std::string>& variants, inline std::pair<std::string, std::string> GetVersionAndArch( const std::string& file_name) { // Remove the file extension - std::string base = file_name.substr(0, file_name.find("tar") - 1); + std::string b = string_utils::RemoveSubstring(file_name, ".tar.gz"); + std::string base = string_utils::RemoveSubstring(b, ".zip"); size_t arch_pos = 0; - if (base.find("windows") != std::string::npos) { - arch_pos = base.find("-windows"); + if (base.find("win") != std::string::npos) { + arch_pos = base.find("-bin-win"); } else if (base.find("linux") != std::string::npos) { - arch_pos = base.find("-linux"); + arch_pos = base.find("-bin-linux"); + } else if (base.find("ubuntu") != std::string::npos) { + arch_pos = base.find("-bin-ubuntu"); } else { - arch_pos = base.find("-mac"); + arch_pos = base.find("-bin-macos"); } // Extract architecture part @@ -202,6 +226,6 @@ inline std::pair<std::string, std::string> GetVersionAndArch( // Extract version part size_t v_pos = base.find_first_of('-'); auto version = base.substr(v_pos + 1, arch_pos - v_pos - 1); - return std::pair("v" + version,
arch); + return std::pair(version, string_utils::RemoveSubstring(arch, "bin-")); } } // namespace engine_matcher_utils diff --git a/engine/utils/github_release_utils.h b/engine/utils/github_release_utils.h index 29f8a5725..84636903a 100644 --- a/engine/utils/github_release_utils.h +++ b/engine/utils/github_release_utils.h @@ -178,11 +178,6 @@ inline cpp::result GetReleaseByVersion( std::vector<std::string> path_params{"repos", author, repo, "releases"}; if (tag != "latest") { path_params.push_back("tags"); - - if (!string_utils::StartsWith(tag, "v")) { - path_params.push_back("v" + tag); - } - path_params.push_back(tag); } else { path_params.push_back("latest"); diff --git a/engine/utils/process/utils.cc b/engine/utils/process/utils.cc index f63de5c5e..c9ccddfdf 100644 --- a/engine/utils/process/utils.cc +++ b/engine/utils/process/utils.cc @@ -347,7 +347,7 @@ bool KillProcess(ProcessInfo& proc_info) { bool success; #if defined(_WIN32) - success = TerminateJobObject(proc_info.hJob, 0) == 0; + success = TerminateJobObject(proc_info.hJob, 0); #elif defined(__APPLE__) || defined(__linux__) // we send SIGTERM to subprocess. we trust that this subprocess will // propagate SIGTERM correctly to its children processes. diff --git a/engine/utils/string_utils.h b/engine/utils/string_utils.h index a9ea756b3..e1a567942 100644 --- a/engine/utils/string_utils.h +++ b/engine/utils/string_utils.h @@ -22,6 +22,12 @@ inline std::string RTrim(const std::string& str) { return (end == std::string::npos) ? "" : str.substr(0, end + 1); } +inline void LTrim(std::string& s) { + s.erase(s.begin(), std::find_if(s.begin(), s.end(), [](unsigned char ch) { + return !std::isspace(ch); + })); +} + inline void Trim(std::string& s) { s.erase(s.begin(), std::find_if(s.begin(), s.end(), [](unsigned char ch) { return !std::isspace(ch); diff --git a/engine/utils/system_info_utils.h b/engine/utils/system_info_utils.h index 54eaed8c9..9bef6f4f9 100644 --- a/engine/utils/system_info_utils.h +++ b/engine/utils/system_info_utils.h @@ -70,7 +70,7 @@ inline std::unique_ptr GetSystemInfo() { #if defined(__i386__) || defined(__x86_64__) || defined(__amd64__) || \ defined(__amd64) || defined(__x86_64) || defined(_M_AMD64) - arch << "amd64"; + arch << "x64"; #elif defined(__arm__) || defined(__arm) || defined(__arm64__) || \ defined(__aarch64__) || defined(__thumb__) || \ defined(__TARGET_ARCH_ARM) || defined(__TARGET_ARCH_THUMB) || \