diff --git a/.github/patches/windows/msvcp140.dll b/.github/patches/windows/msvcp140.dll index f999742d9..d3d103ee0 100644 Binary files a/.github/patches/windows/msvcp140.dll and b/.github/patches/windows/msvcp140.dll differ diff --git a/.github/patches/windows/vcruntime140.dll b/.github/patches/windows/vcruntime140.dll index 3a4aded20..8edab904f 100644 Binary files a/.github/patches/windows/vcruntime140.dll and b/.github/patches/windows/vcruntime140.dll differ diff --git a/.github/patches/windows/vcruntime140_1.dll b/.github/patches/windows/vcruntime140_1.dll index 3ebabdee6..2ef481dbf 100644 Binary files a/.github/patches/windows/vcruntime140_1.dll and b/.github/patches/windows/vcruntime140_1.dll differ diff --git a/.github/workflows/beta-build.yml b/.github/workflows/beta-build.yml index 1bf324d96..64d4e28e7 100644 --- a/.github/workflows/beta-build.yml +++ b/.github/workflows/beta-build.yml @@ -9,7 +9,7 @@ jobs: get-update-version: uses: ./.github/workflows/template-get-update-version.yml - get-cortex-llamacpp-latest-version: + get-llamacpp-latest-version: uses: ./.github/workflows/template-cortex-llamacpp-latest-version.yml create-draft-release: @@ -39,7 +39,7 @@ jobs: build-macos: uses: ./.github/workflows/template-build-macos.yml - needs: [get-update-version, create-draft-release, get-cortex-llamacpp-latest-version] + needs: [get-update-version, create-draft-release, get-llamacpp-latest-version] secrets: inherit with: ref: ${{ github.ref }} @@ -48,12 +48,12 @@ jobs: cmake-flags: "-DCORTEX_VARIANT=beta -DCORTEX_CPP_VERSION='v${{ needs.get-update-version.outputs.new_version }}' -DCMAKE_TOOLCHAIN_FILE=vcpkg/scripts/buildsystems/vcpkg.cmake" channel: beta upload_url: ${{ needs.create-draft-release.outputs.upload_url }} - cortex-llamacpp-version: ${{ needs.get-cortex-llamacpp-latest-version.outputs.cortex_llamacpp_latest_version }} + llamacpp-version: ${{ needs.get-llamacpp-latest-version.outputs.llamacpp_latest_version }} build-windows-x64: uses: ./.github/workflows/template-build-windows-x64.yml secrets: inherit - needs: [get-update-version, create-draft-release, get-cortex-llamacpp-latest-version] + needs: [get-update-version, create-draft-release, get-llamacpp-latest-version] with: ref: ${{ github.ref }} public_provider: github @@ -64,12 +64,12 @@ jobs: ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache' channel: beta upload_url: ${{ needs.create-draft-release.outputs.upload_url }} - cortex-llamacpp-version: ${{ needs.get-cortex-llamacpp-latest-version.outputs.cortex_llamacpp_latest_version }} + llamacpp-version: ${{ needs.get-llamacpp-latest-version.outputs.llamacpp_latest_version }} build-linux-x64: uses: ./.github/workflows/template-build-linux.yml secrets: inherit - needs: [get-update-version, create-draft-release, get-cortex-llamacpp-latest-version] + needs: [get-update-version, create-draft-release, get-llamacpp-latest-version] with: ref: ${{ github.ref }} public_provider: github @@ -78,28 +78,28 @@ jobs: cmake-flags: "-DCORTEX_VARIANT=beta -DCORTEX_CPP_VERSION='v${{ needs.get-update-version.outputs.new_version }}' -DCMAKE_TOOLCHAIN_FILE=/home/runner/actions-runner/_work/cortex.cpp/cortex.cpp/engine/vcpkg/scripts/buildsystems/vcpkg.cmake" channel: beta upload_url: ${{ needs.create-draft-release.outputs.upload_url }} - cortex-llamacpp-version: ${{ needs.get-cortex-llamacpp-latest-version.outputs.cortex_llamacpp_latest_version }} + llamacpp-version: ${{ needs.get-llamacpp-latest-version.outputs.llamacpp_latest_version }} arch: amd64 - build-linux-arm64: - uses: 
./.github/workflows/template-build-linux.yml - secrets: inherit - needs: [get-update-version, create-draft-release, get-cortex-llamacpp-latest-version] - with: - ref: ${{ github.ref }} - public_provider: github - new_version: ${{ needs.get-update-version.outputs.new_version }} - runs-on: ubuntu-2004-arm64 - cmake-flags: "-DCORTEX_VARIANT=beta -DCORTEX_CPP_VERSION='v${{ needs.get-update-version.outputs.new_version }}' -DCMAKE_TOOLCHAIN_FILE=/home/runner/actions-runner/_work/cortex.cpp/cortex.cpp/engine/vcpkg/scripts/buildsystems/vcpkg.cmake" - channel: beta - upload_url: ${{ needs.create-draft-release.outputs.upload_url }} - cortex-llamacpp-version: ${{ needs.get-cortex-llamacpp-latest-version.outputs.cortex_llamacpp_latest_version }} - arch: arm64 + # build-linux-arm64: + # uses: ./.github/workflows/template-build-linux.yml + # secrets: inherit + # needs: [get-update-version, create-draft-release, get-llamacpp-latest-version] + # with: + # ref: ${{ github.ref }} + # public_provider: github + # new_version: ${{ needs.get-update-version.outputs.new_version }} + # runs-on: ubuntu-2004-arm64 + # cmake-flags: "-DCORTEX_VARIANT=beta -DCORTEX_CPP_VERSION='v${{ needs.get-update-version.outputs.new_version }}' -DCMAKE_TOOLCHAIN_FILE=/home/runner/actions-runner/_work/cortex.cpp/cortex.cpp/engine/vcpkg/scripts/buildsystems/vcpkg.cmake" + # channel: beta + # upload_url: ${{ needs.create-draft-release.outputs.upload_url }} + # llamacpp-version: ${{ needs.get-llamacpp-latest-version.outputs.llamacpp_latest_version }} + # arch: arm64 build-docker-x64: uses: ./.github/workflows/template-build-docker-x64.yml secrets: inherit - needs: [get-update-version, get-cortex-llamacpp-latest-version] + needs: [get-update-version, get-llamacpp-latest-version] with: ref: ${{ github.ref }} new_version: ${{ needs.get-update-version.outputs.new_version }} @@ -127,7 +127,7 @@ jobs: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} noti-discord: - needs: [get-update-version, create-draft-release, build-macos, build-windows-x64, build-linux-x64, build-linux-arm64, update_release] + needs: [get-update-version, create-draft-release, build-macos, build-windows-x64, build-linux-x64, update_release] runs-on: ubuntu-latest permissions: contents: write diff --git a/.github/workflows/cortex-cpp-quality-gate.yml b/.github/workflows/cortex-cpp-quality-gate.yml index 279dd77d6..02774d159 100644 --- a/.github/workflows/cortex-cpp-quality-gate.yml +++ b/.github/workflows/cortex-cpp-quality-gate.yml @@ -21,12 +21,12 @@ jobs: fail-fast: false matrix: include: - - os: "linux" - name: "arm64" - runs-on: "ubuntu-2004-arm64" - cmake-flags: "-DCORTEX_CPP_VERSION=${{github.event.pull_request.head.sha}} -DCMAKE_BUILD_TEST=ON -DCMAKE_TOOLCHAIN_FILE=vcpkg/scripts/buildsystems/vcpkg.cmake" - build-deps-cmake-flags: "" - ccache-dir: "" + # - os: "linux" + # name: "arm64" + # runs-on: "ubuntu-2004-arm64" + # cmake-flags: "-DCORTEX_CPP_VERSION=${{github.event.pull_request.head.sha}} -DCMAKE_BUILD_TEST=ON -DCMAKE_TOOLCHAIN_FILE=vcpkg/scripts/buildsystems/vcpkg.cmake" + # build-deps-cmake-flags: "" + # ccache-dir: "" - os: "linux" name: "amd64" runs-on: "ubuntu-20-04-cuda-12-0" @@ -150,6 +150,7 @@ jobs: run: | cd engine mkdir -p ~/.config/cortexcpp/ + mkdir -p ~/.local/share/cortexcpp/ echo "huggingFaceToken: ${{ secrets.HUGGINGFACE_TOKEN_READ }}" > ~/.config/cortexcpp/.cortexrc echo "gitHubToken: ${{ secrets.PAT_SERVICE_ACCOUNT }}" >> ~/.config/cortexcpp/.cortexrc # ./build/cortex @@ -177,6 +178,7 @@ jobs: run: | cd engine mkdir -p ~/.config/cortexcpp/ + mkdir 
-p ~/.local/share/cortexcpp/ echo "apiServerPort: 3928" > ~/.config/cortexcpp/.cortexrc echo "huggingFaceToken: ${{ secrets.HUGGINGFACE_TOKEN_READ }}" >> ~/.config/cortexcpp/.cortexrc echo "gitHubToken: ${{ secrets.PAT_SERVICE_ACCOUNT }}" >> ~/.config/cortexcpp/.cortexrc @@ -352,12 +354,12 @@ jobs: fail-fast: false matrix: include: - - os: "linux" - name: "arm64" - runs-on: "ubuntu-2004-arm64" - cmake-flags: "-DCORTEX_CPP_VERSION=${{github.event.pull_request.head.sha}} -DCMAKE_BUILD_TEST=ON -DCMAKE_TOOLCHAIN_FILE=vcpkg/scripts/buildsystems/vcpkg.cmake" - build-deps-cmake-flags: "" - ccache-dir: "" + # - os: "linux" + # name: "arm64" + # runs-on: "ubuntu-2004-arm64" + # cmake-flags: "-DCORTEX_CPP_VERSION=${{github.event.pull_request.head.sha}} -DCMAKE_BUILD_TEST=ON -DCMAKE_TOOLCHAIN_FILE=vcpkg/scripts/buildsystems/vcpkg.cmake" + # build-deps-cmake-flags: "" + # ccache-dir: "" - os: "linux" name: "amd64" runs-on: "ubuntu-20-04-cuda-12-0" @@ -456,6 +458,7 @@ jobs: run: | cd engine mkdir -p ~/.config/cortexcpp/ + mkdir -p ~/.local/share/cortexcpp/ echo "gitHubToken: ${{ secrets.GITHUB_TOKEN }}" > ~/.config/cortexcpp/.cortexrc # ./build/cortex cat ~/.config/cortexcpp/.cortexrc @@ -481,6 +484,7 @@ jobs: run: | cd engine mkdir -p ~/.config/cortexcpp/ + mkdir -p ~/.local/share/cortexcpp/ echo "apiServerPort: 3928" > ~/.config/cortexcpp/.cortexrc echo "gitHubToken: ${{ secrets.GITHUB_TOKEN }}" > ~/.config/cortexcpp/.cortexrc # ./build/cortex diff --git a/.github/workflows/nightly-build.yml b/.github/workflows/nightly-build.yml index 1f076dc97..f013a90e2 100644 --- a/.github/workflows/nightly-build.yml +++ b/.github/workflows/nightly-build.yml @@ -43,12 +43,12 @@ jobs: get-update-version: uses: ./.github/workflows/template-get-update-version.yml - get-cortex-llamacpp-latest-version: + get-llamacpp-latest-version: uses: ./.github/workflows/template-cortex-llamacpp-latest-version.yml build-macos: uses: ./.github/workflows/template-build-macos.yml - needs: [get-update-version, set-public-provider, get-cortex-llamacpp-latest-version] + needs: [get-update-version, set-public-provider, get-llamacpp-latest-version] secrets: inherit with: ref: ${{ needs.set-public-provider.outputs.ref }} @@ -56,12 +56,12 @@ jobs: new_version: ${{ needs.get-update-version.outputs.new_version }} cmake-flags: "-DCORTEX_VARIANT=nightly -DCORTEX_CPP_VERSION='v${{ needs.get-update-version.outputs.new_version }}' -DCMAKE_TOOLCHAIN_FILE=vcpkg/scripts/buildsystems/vcpkg.cmake" channel: nightly - cortex-llamacpp-version: ${{ needs.get-cortex-llamacpp-latest-version.outputs.cortex_llamacpp_latest_version }} + llamacpp-version: ${{ needs.get-llamacpp-latest-version.outputs.llamacpp_latest_version }} build-windows-x64: uses: ./.github/workflows/template-build-windows-x64.yml secrets: inherit - needs: [get-update-version, set-public-provider, get-cortex-llamacpp-latest-version] + needs: [get-update-version, set-public-provider, get-llamacpp-latest-version] with: ref: ${{ needs.set-public-provider.outputs.ref }} public_provider: ${{ needs.set-public-provider.outputs.public_provider }} @@ -71,12 +71,12 @@ jobs: build-deps-cmake-flags: "-DCMAKE_BUILD_TYPE=RELEASE -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -GNinja" ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache' channel: nightly - cortex-llamacpp-version: ${{ needs.get-cortex-llamacpp-latest-version.outputs.cortex_llamacpp_latest_version }} + llamacpp-version: ${{ 
needs.get-llamacpp-latest-version.outputs.llamacpp_latest_version }} build-linux-x64: uses: ./.github/workflows/template-build-linux.yml secrets: inherit - needs: [get-update-version, set-public-provider, get-cortex-llamacpp-latest-version] + needs: [get-update-version, set-public-provider, get-llamacpp-latest-version] with: ref: ${{ needs.set-public-provider.outputs.ref }} public_provider: ${{ needs.set-public-provider.outputs.public_provider }} @@ -84,27 +84,27 @@ jobs: runs-on: ubuntu-20-04 cmake-flags: "-DCORTEX_VARIANT=nightly -DCORTEX_CPP_VERSION='v${{ needs.get-update-version.outputs.new_version }}' -DCMAKE_TOOLCHAIN_FILE=/home/runner/actions-runner/_work/cortex.cpp/cortex.cpp/engine/vcpkg/scripts/buildsystems/vcpkg.cmake" channel: nightly - cortex-llamacpp-version: ${{ needs.get-cortex-llamacpp-latest-version.outputs.cortex_llamacpp_latest_version }} + llamacpp-version: ${{ needs.get-llamacpp-latest-version.outputs.llamacpp_latest_version }} arch: amd64 - build-linux-arm64: - uses: ./.github/workflows/template-build-linux.yml - secrets: inherit - needs: [get-update-version, set-public-provider, get-cortex-llamacpp-latest-version] - with: - ref: ${{ needs.set-public-provider.outputs.ref }} - public_provider: ${{ needs.set-public-provider.outputs.public_provider }} - new_version: ${{ needs.get-update-version.outputs.new_version }} - runs-on: ubuntu-2004-arm64 - cmake-flags: "-DCORTEX_VARIANT=nightly -DCORTEX_CPP_VERSION='v${{ needs.get-update-version.outputs.new_version }}' -DCMAKE_TOOLCHAIN_FILE=/home/runner/actions-runner/_work/cortex.cpp/cortex.cpp/engine/vcpkg/scripts/buildsystems/vcpkg.cmake" - channel: nightly - cortex-llamacpp-version: ${{ needs.get-cortex-llamacpp-latest-version.outputs.cortex_llamacpp_latest_version }} - arch: arm64 + # build-linux-arm64: + # uses: ./.github/workflows/template-build-linux.yml + # secrets: inherit + # needs: [get-update-version, set-public-provider, get-llamacpp-latest-version] + # with: + # ref: ${{ needs.set-public-provider.outputs.ref }} + # public_provider: ${{ needs.set-public-provider.outputs.public_provider }} + # new_version: ${{ needs.get-update-version.outputs.new_version }} + # runs-on: ubuntu-2004-arm64 + # cmake-flags: "-DCORTEX_VARIANT=nightly -DCORTEX_CPP_VERSION='v${{ needs.get-update-version.outputs.new_version }}' -DCMAKE_TOOLCHAIN_FILE=/home/runner/actions-runner/_work/cortex.cpp/cortex.cpp/engine/vcpkg/scripts/buildsystems/vcpkg.cmake" + # channel: nightly + # llamacpp-version: ${{ needs.get-llamacpp-latest-version.outputs.llamacpp_latest_version }} + # arch: arm64 update-latest-version: runs-on: ubuntu-latest if: needs.set-public-provider.outputs.public_provider == 'aws-s3' - needs: [get-update-version, set-public-provider, build-linux-x64, build-linux-arm64, build-macos, build-windows-x64, get-cortex-llamacpp-latest-version] + needs: [get-update-version, set-public-provider, build-linux-x64, build-macos, build-windows-x64, get-llamacpp-latest-version] steps: - name: Update latest version id: update-latest-version @@ -132,7 +132,7 @@ jobs: if: needs.set-public-provider.outputs.public_provider == 'aws-s3' uses: ./.github/workflows/template-build-docker-x64.yml secrets: inherit - needs: [get-update-version, set-public-provider, get-cortex-llamacpp-latest-version, update-latest-version] + needs: [get-update-version, set-public-provider, get-llamacpp-latest-version, update-latest-version] with: ref: ${{ needs.set-public-provider.outputs.ref }} new_version: nightly-${{ needs.get-update-version.outputs.new_version }} @@ -141,7 
+141,7 @@ jobs: tags: menloltd/cortex:nightly-${{ needs.get-update-version.outputs.new_version }} noti-discord-nightly-and-update-url-readme: - needs: [build-macos, build-windows-x64, build-linux-x64, get-update-version, set-public-provider, get-cortex-llamacpp-latest-version, update-latest-version, build-docker-x64] + needs: [build-macos, build-windows-x64, build-linux-x64, get-update-version, set-public-provider, get-llamacpp-latest-version, update-latest-version, build-docker-x64] secrets: inherit if: github.event_name == 'schedule' uses: ./.github/workflows/template-noti-discord.yaml @@ -150,7 +150,7 @@ jobs: new_version: ${{ needs.get-update-version.outputs.new_version }} noti-discord-manual: - needs: [build-macos, build-windows-x64, build-linux-x64, get-update-version, set-public-provider, get-cortex-llamacpp-latest-version, build-docker-x64] + needs: [build-macos, build-windows-x64, build-linux-x64, get-update-version, set-public-provider, get-llamacpp-latest-version, build-docker-x64] secrets: inherit if: github.event_name == 'workflow_dispatch' && github.event.inputs.public_provider == 'aws-s3' uses: ./.github/workflows/template-noti-discord.yaml diff --git a/.github/workflows/stable-build.yml b/.github/workflows/stable-build.yml index b05df983d..27e05f9ce 100644 --- a/.github/workflows/stable-build.yml +++ b/.github/workflows/stable-build.yml @@ -9,7 +9,7 @@ jobs: get-update-version: uses: ./.github/workflows/template-get-update-version.yml - get-cortex-llamacpp-latest-version: + get-llamacpp-latest-version: uses: ./.github/workflows/template-cortex-llamacpp-latest-version.yml create-draft-release: @@ -39,7 +39,7 @@ jobs: build-macos: uses: ./.github/workflows/template-build-macos.yml - needs: [get-update-version, create-draft-release, get-cortex-llamacpp-latest-version] + needs: [get-update-version, create-draft-release, get-llamacpp-latest-version] secrets: inherit with: ref: ${{ github.ref }} @@ -48,12 +48,12 @@ jobs: cmake-flags: "-DCORTEX_VARIANT=prod -DCORTEX_CPP_VERSION='v${{ needs.get-update-version.outputs.new_version }}' -DCMAKE_TOOLCHAIN_FILE=vcpkg/scripts/buildsystems/vcpkg.cmake" channel: stable upload_url: ${{ needs.create-draft-release.outputs.upload_url }} - cortex-llamacpp-version: ${{ needs.get-cortex-llamacpp-latest-version.outputs.cortex_llamacpp_latest_version }} + llamacpp-version: ${{ needs.get-llamacpp-latest-version.outputs.llamacpp_latest_version }} build-windows-x64: uses: ./.github/workflows/template-build-windows-x64.yml secrets: inherit - needs: [get-update-version, create-draft-release, get-cortex-llamacpp-latest-version] + needs: [get-update-version, create-draft-release, get-llamacpp-latest-version] with: ref: ${{ github.ref }} public_provider: github @@ -64,12 +64,12 @@ jobs: ccache-dir: 'C:\Users\ContainerAdministrator\AppData\Local\ccache' channel: stable upload_url: ${{ needs.create-draft-release.outputs.upload_url }} - cortex-llamacpp-version: ${{ needs.get-cortex-llamacpp-latest-version.outputs.cortex_llamacpp_latest_version }} + llamacpp-version: ${{ needs.get-llamacpp-latest-version.outputs.llamacpp_latest_version }} build-linux-x64: uses: ./.github/workflows/template-build-linux.yml secrets: inherit - needs: [get-update-version, create-draft-release, get-cortex-llamacpp-latest-version] + needs: [get-update-version, create-draft-release, get-llamacpp-latest-version] with: ref: ${{ github.ref }} public_provider: github @@ -78,28 +78,28 @@ jobs: cmake-flags: "-DCORTEX_VARIANT=prod -DCORTEX_CPP_VERSION='v${{ 
needs.get-update-version.outputs.new_version }}' -DCMAKE_TOOLCHAIN_FILE=/home/runner/actions-runner/_work/cortex.cpp/cortex.cpp/engine/vcpkg/scripts/buildsystems/vcpkg.cmake" channel: stable upload_url: ${{ needs.create-draft-release.outputs.upload_url }} - cortex-llamacpp-version: ${{ needs.get-cortex-llamacpp-latest-version.outputs.cortex_llamacpp_latest_version }} + llamacpp-version: ${{ needs.get-llamacpp-latest-version.outputs.llamacpp_latest_version }} arch: amd64 - build-linux-arm64: - uses: ./.github/workflows/template-build-linux.yml - secrets: inherit - needs: [get-update-version, create-draft-release, get-cortex-llamacpp-latest-version] - with: - ref: ${{ github.ref }} - public_provider: github - new_version: ${{ needs.get-update-version.outputs.new_version }} - runs-on: ubuntu-2004-arm64 - cmake-flags: "-DCORTEX_VARIANT=prod -DCORTEX_CPP_VERSION='v${{ needs.get-update-version.outputs.new_version }}' -DCMAKE_TOOLCHAIN_FILE=/home/runner/actions-runner/_work/cortex.cpp/cortex.cpp/engine/vcpkg/scripts/buildsystems/vcpkg.cmake" - channel: stable - upload_url: ${{ needs.create-draft-release.outputs.upload_url }} - cortex-llamacpp-version: ${{ needs.get-cortex-llamacpp-latest-version.outputs.cortex_llamacpp_latest_version }} - arch: arm64 + # build-linux-arm64: + # uses: ./.github/workflows/template-build-linux.yml + # secrets: inherit + # needs: [get-update-version, create-draft-release, get-llamacpp-latest-version] + # with: + # ref: ${{ github.ref }} + # public_provider: github + # new_version: ${{ needs.get-update-version.outputs.new_version }} + # runs-on: ubuntu-2004-arm64 + # cmake-flags: "-DCORTEX_VARIANT=prod -DCORTEX_CPP_VERSION='v${{ needs.get-update-version.outputs.new_version }}' -DCMAKE_TOOLCHAIN_FILE=/home/runner/actions-runner/_work/cortex.cpp/cortex.cpp/engine/vcpkg/scripts/buildsystems/vcpkg.cmake" + # channel: stable + # upload_url: ${{ needs.create-draft-release.outputs.upload_url }} + # llamacpp-version: ${{ needs.get-llamacpp-latest-version.outputs.llamacpp_latest_version }} + # arch: arm64 build-docker-x64: uses: ./.github/workflows/template-build-docker-x64.yml secrets: inherit - needs: [get-update-version, get-cortex-llamacpp-latest-version] + needs: [get-update-version, get-llamacpp-latest-version] with: ref: ${{ github.ref }} new_version: ${{ needs.get-update-version.outputs.new_version }} diff --git a/.github/workflows/template-build-linux.yml b/.github/workflows/template-build-linux.yml index 3fa802ad4..0ebd04176 100644 --- a/.github/workflows/template-build-linux.yml +++ b/.github/workflows/template-build-linux.yml @@ -44,7 +44,7 @@ on: type: string default: 'nightly' description: 'The channel to use for this job' - cortex-llamacpp-version: + llamacpp-version: required: true type: string default: '0.0.0' @@ -169,23 +169,23 @@ jobs: mkdir -p engine/templates/linux/dependencies cd engine/templates/linux/dependencies if [ "${{ inputs.arch }}" == "amd64" ]; then - # wget https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-linux-amd64-avx-cuda-11-7.tar.gz - # wget https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-linux-amd64-avx-cuda-12-0.tar.gz - # wget https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-linux-amd64-avx.tar.gz - wget 
https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-linux-amd64-avx2-cuda-11-7.tar.gz - wget https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-linux-amd64-avx2-cuda-12-0.tar.gz - wget https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-linux-amd64-avx2.tar.gz - # wget https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-linux-amd64-avx512-cuda-11-7.tar.gz - # wget https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-linux-amd64-avx512-cuda-12-0.tar.gz - # wget https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-linux-amd64-avx512.tar.gz - wget https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-linux-amd64-noavx-cuda-11-7.tar.gz - wget https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-linux-amd64-noavx-cuda-12-0.tar.gz - wget https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-linux-amd64-noavx.tar.gz - wget https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-linux-amd64-vulkan.tar.gz - wget https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cuda-11-7-linux-amd64.tar.gz - wget https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cuda-12-0-linux-amd64.tar.gz + # wget https://github.com/menloresearch/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/llama-${{ inputs.llamacpp-version }}-bin-linux-avx-cuda-cu11.7-x64.tar.gz + # wget https://github.com/menloresearch/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/llama-${{ inputs.llamacpp-version }}-bin-linux-avx-cuda-cu12.0-x64.tar.gz + # wget https://github.com/menloresearch/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/llama-${{ inputs.llamacpp-version }}-bin-linux-avx-x64.tar.gz + wget https://github.com/menloresearch/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/llama-${{ inputs.llamacpp-version }}-bin-linux-avx2-cuda-cu11.7-x64.tar.gz + wget https://github.com/menloresearch/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/llama-${{ inputs.llamacpp-version }}-bin-linux-avx2-cuda-cu12.0-x64.tar.gz + wget https://github.com/ggml-org/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/llama-${{ inputs.llamacpp-version }}-bin-ubuntu-x64.zip + # wget https://github.com/menloresearch/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/llama-${{ inputs.llamacpp-version }}-bin-linux-avx512-cuda-cu11.7-x64.tar.gz + # wget https://github.com/menloresearch/llama.cpp/releases/download/${{ inputs.llamacpp-version 
}}/llama-${{ inputs.llamacpp-version }}-bin-linux-avx512-cuda-cu12.0-x64.tar.gz + # wget https://github.com/menloresearch/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/llama-${{ inputs.llamacpp-version }}-bin-linux-avx512-x64.tar.gz + wget https://github.com/menloresearch/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/llama-${{ inputs.llamacpp-version }}-bin-linux-noavx-cuda-cu11.7-x64.tar.gz + wget https://github.com/menloresearch/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/llama-${{ inputs.llamacpp-version }}-bin-linux-noavx-cuda-cu12.0-x64.tar.gz + wget https://github.com/menloresearch/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/llama-${{ inputs.llamacpp-version }}-bin-linux-noavx-x64.tar.gz + wget https://github.com/ggml-org/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/llama-${{ inputs.llamacpp-version }}-bin-ubuntu-vulkan-x64.zip + wget https://github.com/menloresearch/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/cudart-llama-bin-linux-cu11.7-x64.tar.gz + wget https://github.com/menloresearch/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/cudart-llama-bin-win-cu12.0-x64.tar.gz else - wget https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-linux-arm64.tar.gz + wget https://github.com/ggml-org/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/llama-${{ inputs.llamacpp-version }}-bin-ubuntu-arm64.zip fi cd .. diff --git a/.github/workflows/template-build-macos.yml b/.github/workflows/template-build-macos.yml index 20c7430fb..ea96d2df6 100644 --- a/.github/workflows/template-build-macos.yml +++ b/.github/workflows/template-build-macos.yml @@ -39,7 +39,7 @@ on: type: string default: 'nightly' description: 'The channel to use for this job' - cortex-llamacpp-version: + llamacpp-version: required: true type: string default: '0.0.0' @@ -253,6 +253,14 @@ jobs: cd engine make codesign-binary CODE_SIGN=true DEVELOPER_ID="${{ secrets.DEVELOPER_ID }}" DESTINATION_BINARY_NAME="${{ steps.set-output-params.outputs.destination_binary_name }}" DESTINATION_BINARY_SERVER_NAME="${{ steps.set-output-params.outputs.destination_binary_server_name }}" + - name: Code Signing binaries for separate binary + run: | + codesign --force -s "${{ secrets.DEVELOPER_ID }}" --options=runtime --entitlements="./engine/templates/macos/entitlements.plist" ./cortex-${{ inputs.new_version }}-mac-arm64/${{ steps.set-output-params.outputs.destination_binary_name }} + codesign --force -s "${{ secrets.DEVELOPER_ID }}" --options=runtime --entitlements="./engine/templates/macos/entitlements.plist" ./cortex-${{ inputs.new_version }}-mac-arm64/${{ steps.set-output-params.outputs.destination_binary_server_name }} + + codesign --force -s "${{ secrets.DEVELOPER_ID }}" --options=runtime --entitlements="./engine/templates/macos/entitlements.plist" ./cortex-${{ inputs.new_version }}-mac-amd64/${{ steps.set-output-params.outputs.destination_binary_name }} + codesign --force -s "${{ secrets.DEVELOPER_ID }}" --options=runtime --entitlements="./engine/templates/macos/entitlements.plist" ./cortex-${{ inputs.new_version }}-mac-amd64/${{ steps.set-output-params.outputs.destination_binary_server_name }} + - name: Notary macOS Binary run: | curl -sSfL https://raw.githubusercontent.com/anchore/quill/main/install.sh | sh -s -- -b /usr/local/bin @@ -265,6 +273,18 @@ jobs: QUILL_NOTARY_ISSUER: ${{ secrets.NOTARY_ISSUER }} QUILL_NOTARY_KEY: 
"/tmp/notary-key.p8" + - name: Notary macOS Binary for separate binary + run: | + # Notarize the binary + quill notarize ./cortex-${{ inputs.new_version }}-mac-arm64/${{ steps.set-output-params.outputs.destination_binary_name }} + quill notarize ./cortex-${{ inputs.new_version }}-mac-arm64/${{ steps.set-output-params.outputs.destination_binary_server_name }} + quill notarize ./cortex-${{ inputs.new_version }}-mac-amd64/${{ steps.set-output-params.outputs.destination_binary_name }} + quill notarize ./cortex-${{ inputs.new_version }}-mac-amd64/${{ steps.set-output-params.outputs.destination_binary_server_name }} + env: + QUILL_NOTARY_KEY_ID: ${{ secrets.NOTARY_KEY_ID }} + QUILL_NOTARY_ISSUER: ${{ secrets.NOTARY_ISSUER }} + QUILL_NOTARY_KEY: "/tmp/notary-key.p8" + - name: Build network Installers shell: bash run: | @@ -289,8 +309,8 @@ jobs: run: | mkdir -p engine/templates/macos/Scripts/dependencies cd engine/templates/macos/Scripts/dependencies - wget https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-mac-arm64.tar.gz - wget https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-mac-amd64.tar.gz + wget https://github.com/menloresearch/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/llama-${{ inputs.llamacpp-version }}-bin-macos-arm64.tar.gz + wget https://github.com/menloresearch/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/llama-${{ inputs.llamacpp-version }}-bin-macos-x64.tar.gz cd ../../ chmod +x create_pkg_local.sh @@ -310,6 +330,24 @@ jobs: xcrun notarytool submit ${{ steps.set-output-params.outputs.package_name }}-local.pkg --apple-id ${{ secrets.APPLE_ID }} --password ${{ secrets.APPLE_APP_SPECIFIC_PASSWORD }} --team-id ${{ secrets.APPLE_TEAM_ID }} --wait - name: Package + run: | + mkdir temp + # Mac arm64 + mv cortex-${{ inputs.new_version }}-mac-arm64 temp/cortex + cd temp + tar -czvf cortex-arm64.tar.gz cortex + mv cortex-arm64.tar.gz ../cortex-arm64.tar.gz + cd .. + rm -rf temp/cortex + + # Mac amd64 + mv cortex-${{ inputs.new_version }}-mac-amd64 temp/cortex + cd temp + tar -czvf cortex-amd64.tar.gz cortex + mv cortex-amd64.tar.gz ../cortex-amd64.tar.gz + cd .. 
+ + - name: Package for separate binary run: | cd engine make package @@ -320,6 +358,18 @@ jobs: name: cortex-${{ inputs.new_version }}-mac-universal path: ./engine/cortex + - name: Upload Artifact + uses: actions/upload-artifact@v4 + with: + name: cortex-${{ inputs.new_version }}-mac-arm64-signed + path: ./cortex-${{ inputs.new_version }}-mac-arm64 + + - name: Upload Artifact + uses: actions/upload-artifact@v4 + with: + name: cortex-${{ inputs.new_version }}-mac-amd64-signed + path: ./cortex-${{ inputs.new_version }}-mac-amd64 + - name: Upload Artifact uses: actions/upload-artifact@v4 with: @@ -358,6 +408,28 @@ jobs: asset_name: cortex-${{ inputs.new_version }}-mac-universal.tar.gz asset_content_type: application/zip + - name: Upload release assert if public provider is github + if: inputs.public_provider == 'github' + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + uses: actions/upload-release-asset@v1.0.1 + with: + upload_url: ${{ inputs.upload_url }} + asset_path: ./cortex-arm64.tar.gz + asset_name: cortex-${{ inputs.new_version }}-mac-arm64.tar.gz + asset_content_type: application/zip + + - name: Upload release assert if public provider is github + if: inputs.public_provider == 'github' + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + uses: actions/upload-release-asset@v1.0.1 + with: + upload_url: ${{ inputs.upload_url }} + asset_path: ./cortex-amd64.tar.gz + asset_name: cortex-${{ inputs.new_version }}-mac-amd64.tar.gz + asset_content_type: application/zip + - name: Upload release assert if public provider is github if: inputs.public_provider == 'github' env: diff --git a/.github/workflows/template-build-windows-x64.yml b/.github/workflows/template-build-windows-x64.yml index b9e0c9937..399e3dd3e 100644 --- a/.github/workflows/template-build-windows-x64.yml +++ b/.github/workflows/template-build-windows-x64.yml @@ -44,7 +44,7 @@ on: type: string default: 'nightly' description: 'The channel to use for this job' - cortex-llamacpp-version: + llamacpp-version: required: true type: string default: '0.0.0' @@ -205,21 +205,21 @@ jobs: run: | mkdir dependencies cd dependencies - # wget.exe https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-windows-amd64-avx-cuda-11-7.tar.gz - # wget.exe https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-windows-amd64-avx-cuda-12-0.tar.gz - # wget.exe https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-windows-amd64-avx.tar.gz - wget.exe https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-windows-amd64-avx2-cuda-11-7.tar.gz - wget.exe https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-windows-amd64-avx2-cuda-12-0.tar.gz - wget.exe https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-windows-amd64-avx2.tar.gz - # wget.exe https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-windows-amd64-avx512-cuda-11-7.tar.gz 
- # wget.exe https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-windows-amd64-avx512-cuda-12-0.tar.gz - # wget.exe https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-windows-amd64-avx512.tar.gz - wget.exe https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-windows-amd64-noavx-cuda-11-7.tar.gz - wget.exe https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-windows-amd64-noavx-cuda-12-0.tar.gz - wget.exe https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-windows-amd64-noavx.tar.gz - wget.exe https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cortex.llamacpp-${{ inputs.cortex-llamacpp-version }}-windows-amd64-vulkan.tar.gz - wget.exe https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cuda-11-7-windows-amd64.tar.gz - wget.exe https://github.com/menloresearch/cortex.llamacpp/releases/download/v${{ inputs.cortex-llamacpp-version }}/cuda-12-0-windows-amd64.tar.gz + # wget.exe https://github.com/menloresearch/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/llama-${{ inputs.llamacpp-version }}-bin-win-avx-cuda-cu11.7-x64.tar.gz + # wget.exe https://github.com/menloresearch/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/llama-${{ inputs.llamacpp-version }}-bin-win-avx-cuda-cu12.0-x64.tar.gz + # wget.exe https://github.com/ggml-org/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/llama-${{ inputs.llamacpp-version }}-bin-win-avx-x64.zip + wget.exe https://github.com/menloresearch/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/llama-${{ inputs.llamacpp-version }}-bin-win-avx2-cuda-cu11.7-x64.tar.gz + wget.exe https://github.com/menloresearch/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/llama-${{ inputs.llamacpp-version }}-bin-win-avx2-cuda-cu12.0-x64.tar.gz + wget.exe https://github.com/ggml-org/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/llama-${{ inputs.llamacpp-version }}-bin-win-avx2-x64.zip + # wget.exe https://github.com/menloresearch/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/llama-${{ inputs.llamacpp-version }}-bin-win-avx512-cuda-cu11.7-x64.tar.gz + # wget.exe https://github.com/menloresearch/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/llama-${{ inputs.llamacpp-version }}-bin-win-avx512-cuda-cu12.0-x64.tar.gz + # wget.exe https://github.com/ggml-org/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/llama-${{ inputs.llamacpp-version }}-bin-win-avx512-x64.zip + wget.exe https://github.com/menloresearch/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/llama-${{ inputs.llamacpp-version }}-bin-win-noavx-cuda-cu11.7-x64.tar.gz + wget.exe https://github.com/menloresearch/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/llama-${{ inputs.llamacpp-version }}-bin-win-noavx-cuda-cu12.0-x64.tar.gz + wget.exe https://github.com/ggml-org/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/llama-${{ inputs.llamacpp-version 
}}-bin-win-noavx-x64.zip + wget.exe https://github.com/ggml-org/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/llama-${{ inputs.llamacpp-version }}-bin-win-vulkan-x64.zip + wget.exe https://github.com/menloresearch/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/cudart-llama-bin-win-cu11.7-x64.tar.gz + wget.exe https://github.com/menloresearch/llama.cpp/releases/download/${{ inputs.llamacpp-version }}/cudart-llama-bin-win-cu12.0-x64.tar.gz - name: Enable long paths run: | diff --git a/.github/workflows/template-cortex-llamacpp-latest-version.yml b/.github/workflows/template-cortex-llamacpp-latest-version.yml index 610b1a89a..3d7b74e56 100644 --- a/.github/workflows/template-cortex-llamacpp-latest-version.yml +++ b/.github/workflows/template-cortex-llamacpp-latest-version.yml @@ -1,13 +1,13 @@ -name: get-cortex-llamacpp-latest-version +name: get-llamacpp-latest-version on: workflow_call: outputs: - cortex_llamacpp_latest_version: + llamacpp_latest_version: description: 'The latest version of cortex.llamacpp engines' - value: ${{ jobs.get-cortex-llamacpp-latest-version.outputs.new_version }} + value: ${{ jobs.get-llamacpp-latest-version.outputs.new_version }} jobs: - get-cortex-llamacpp-latest-version: + get-llamacpp-latest-version: runs-on: ubuntu-latest outputs: new_version: ${{ steps.version_update.outputs.new_version }} @@ -24,7 +24,7 @@ jobs: local max_retries=3 local tag while [ $retries -lt $max_retries ]; do - tag=$(curl -s https://api.github.com/repos/menloresearch/cortex.llamacpp/releases/latest | jq -r .tag_name) + tag=$(curl -s https://api.github.com/repos/menloresearch/llama.cpp/releases/latest | jq -r .tag_name) if [ -n "$tag" ] && [ "$tag" != "null" ]; then echo $tag return diff --git a/README.md b/README.md index 5cd51ece1..f56842d29 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,10 @@ +
+# 🚨 Archived Repository Notice
+
+This repository is no longer actively maintained.
+
+Development has moved to [menloresearch/llama.cpp](https://github.com/menloresearch/llama.cpp).
+
+Please contribute directly to llama.cpp moving forward.
+
 # Cortex
 
diff --git a/docker/Dockerfile b/docker/Dockerfile index 744c3899c..5f04da12e 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -24,7 +24,6 @@ RUN wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/nul apt-add-repository "deb https://apt.kitware.com/ubuntu/ $(lsb_release -cs) main" && \ apt-get update && \ apt-get install -y --no-install-recommends \ - cmake \ make \ git \ uuid-dev \ @@ -37,11 +36,21 @@ RUN wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/nul ninja-build \ pkg-config \ python3-pip \ - openssl && \ + openssl \ + libssl-dev && \ pip3 install awscli && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* +# Download and install CMake 3.22.6 +RUN wget https://github.com/Kitware/CMake/releases/download/v3.22.6/cmake-3.22.6.tar.gz -q -O /tmp/cmake.tar.gz && \ + tar -xzf /tmp/cmake.tar.gz -C /tmp && \ + cd /tmp/cmake-3.22.6 && \ + ./bootstrap && \ + make -j$(nproc) && \ + make install && \ + rm -rf /tmp/cmake.tar.gz /tmp/cmake-3.22.6 + ARG CORTEX_CPP_VERSION=latest ARG CMAKE_EXTRA_FLAGS="" diff --git a/docker/Dockerfile.cache b/docker/Dockerfile.cache index 0a9cbe02d..3eabc5dce 100644 --- a/docker/Dockerfile.cache +++ b/docker/Dockerfile.cache @@ -24,7 +24,6 @@ RUN wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/nul apt-add-repository "deb https://apt.kitware.com/ubuntu/ $(lsb_release -cs) main" && \ apt-get update && \ apt-get install -y --no-install-recommends \ - cmake \ make \ git \ uuid-dev \ @@ -37,11 +36,21 @@ RUN wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/nul ninja-build \ pkg-config \ python3-pip \ - openssl && \ + openssl \ + libssl-dev && \ pip3 install awscli && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* +# Download and install CMake 3.22.6 +RUN wget https://github.com/Kitware/CMake/releases/download/v3.22.6/cmake-3.22.6.tar.gz -q -O /tmp/cmake.tar.gz && \ + tar -xzf /tmp/cmake.tar.gz -C /tmp && \ + cd /tmp/cmake-3.22.6 && \ + ./bootstrap && \ + make -j$(nproc) && \ + make install && \ + rm -rf /tmp/cmake.tar.gz /tmp/cmake-3.22.6 + ARG CORTEX_CPP_VERSION=latest ARG CMAKE_EXTRA_FLAGS="" diff --git a/docs/docs/engines/engine-extension.mdx b/docs/docs/engines/engine-extension.mdx index d2edde830..8b550c5a4 100644 --- a/docs/docs/engines/engine-extension.mdx +++ b/docs/docs/engines/engine-extension.mdx @@ -71,9 +71,6 @@ class EngineI { std::shared_ptr json_body, std::function&& callback) = 0; - // Compatibility and model management - virtual bool IsSupported(const std::string& f) = 0; - virtual void GetModels( std::shared_ptr jsonBody, std::function&& callback) = 0; diff --git a/docs/docs/guides/function-calling.md b/docs/docs/guides/function-calling.md index 6b9157f18..7725f225d 100644 --- a/docs/docs/guides/function-calling.md +++ b/docs/docs/guides/function-calling.md @@ -63,8 +63,14 @@ tools = [ completion_payload = { "messages": [ - {"role": "system", "content": "You are a helpful customer support assistant. 
Use the supplied tools to assist the user."}, - {"role": "user", "content": "Hi, can you tell me the delivery date for my order?"}, + { + "role": "system", + "content": 'You have access to the following CUSTOM functions:\n\n\n\nIf a you choose to call a function ONLY reply in the following format:\n<{start_tag}={function_name}>{parameters}{end_tag}\nwhere\n\nstart_tag => ` a JSON dict with the function argument name as key and function argument value as value.\nend_tag => ``\n\nHere is an example,\n{"example_name": "example_value"}\n\nReminder:\n- Function calls MUST follow the specified format\n- Required parameters MUST be specified\n- You can call one or more functions at a time, but remember only chose correct function\n- Put the entire function call reply on one line\n- Always add your sources when using search results to answer the user query\n- If you can not find correct parameters or arguments corresponding to function in the user\'s message, ask user again to provide, do not make assumptions.\n- No explanation are needed when calling a function.\n\nYou are a helpful assistant.', + }, + { + "role": "user", + "content": "Hi, can you tell me the delivery date for my order?" + }, ] } @@ -126,10 +132,22 @@ Once the user provides their order ID: ```python completion_payload = { "messages": [ - {"role": "system", "content": "You are a helpful customer support assistant. Use the supplied tools to assist the user."}, - {"role": "user", "content": "Hi, can you tell me the delivery date for my order?"}, - {"role": "assistant", "content": "Of course! Please provide your order ID so I can look it up."}, - {"role": "user", "content": "i think it is order_70705"}, + { + "role": "system", + "content": 'You have access to the following CUSTOM functions:\n\n\n\nIf a you choose to call a function ONLY reply in the following format:\n<{start_tag}={function_name}>{parameters}{end_tag}\nwhere\n\nstart_tag => ` a JSON dict with the function argument name as key and function argument value as value.\nend_tag => ``\n\nHere is an example,\n{"example_name": "example_value"}\n\nReminder:\n- Function calls MUST follow the specified format\n- Required parameters MUST be specified\n- You can call one or more functions at a time, but remember only chose correct function\n- Put the entire function call reply on one line\n- Always add your sources when using search results to answer the user query\n- If you can not find correct parameters or arguments corresponding to function in the user\'s message, ask user again to provide, do not make assumptions.\n- No explanation are needed when calling a function.\n\nYou are a helpful assistant.', + }, + { + "role": "user", + "content": "Hi, can you tell me the delivery date for my order?" + }, + { + "role": "assistant", + "content": "Of course! Please provide your order ID so I can look it up." 
+ }, + { + "role": "user", + "content": "i think it is order_70705" + }, ] } diff --git a/docs/static/openapi/cortex.json b/docs/static/openapi/cortex.json index 23970ef51..b7d628094 100644 --- a/docs/static/openapi/cortex.json +++ b/docs/static/openapi/cortex.json @@ -2754,7 +2754,7 @@ }, "version": { "type": "string", - "example": "0.1.35-28.10.24" + "example": "b4920" } } } @@ -2763,11 +2763,11 @@ { "engine": "llama-cpp", "name": "mac-arm64", - "version": "0.1.35-28.10.24" + "version": "b4920" }, { "engine": "llama-cpp", - "name": "linux-amd64-avx", + "name": "linux-avx-x64", "version": "0.1.35-27.10.24" } ] @@ -2901,7 +2901,7 @@ "name": { "type": "string", "description": "The name of the variant, including OS, architecture, and capabilities", - "example": "linux-amd64-avx-cuda-11-7" + "example": "linux-avx-x64-cuda-11-7" }, "created_at": { "type": "string", @@ -2973,7 +2973,7 @@ }, "name": { "type": "string", - "example": "0.1.39-linux-amd64-avx-cuda-11-7" + "example": "llama-b4920-bin-linux-avx-cuda-cu11.7" }, "size": { "type": "integer", @@ -3250,7 +3250,7 @@ }, "version": { "type": "string", - "example": "0.1.35-28.10.24" + "example": "b4920" } } } diff --git a/engine/CMakeLists.txt b/engine/CMakeLists.txt index f7a20b58b..39052b08e 100644 --- a/engine/CMakeLists.txt +++ b/engine/CMakeLists.txt @@ -182,6 +182,7 @@ add_executable(${TARGET_NAME} main.cc ${CMAKE_CURRENT_SOURCE_DIR}/utils/process/utils.cc ${CMAKE_CURRENT_SOURCE_DIR}/extensions/remote-engine/remote_engine.cc + ${CMAKE_CURRENT_SOURCE_DIR}/extensions/local-engine/local_engine.cc ) @@ -227,3 +228,12 @@ set_target_properties(${TARGET_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY_RELEASE ${CMAKE_BINARY_DIR} RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR} ) + +if(MSVC) + add_custom_command( + TARGET ${TARGET_NAME} POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy_directory + ${CMAKE_CURRENT_SOURCE_DIR}/../.github/patches/windows + ${CMAKE_BINARY_DIR}/ + ) +endif() \ No newline at end of file diff --git a/engine/cli/CMakeLists.txt b/engine/cli/CMakeLists.txt index 4163042d0..bb18433fe 100644 --- a/engine/cli/CMakeLists.txt +++ b/engine/cli/CMakeLists.txt @@ -73,7 +73,7 @@ add_executable(${TARGET_NAME} main.cc ${CMAKE_CURRENT_SOURCE_DIR}/../services/hardware_service.cc ${CMAKE_CURRENT_SOURCE_DIR}/../services/database_service.cc ${CMAKE_CURRENT_SOURCE_DIR}/../extensions/remote-engine/remote_engine.cc - + ${CMAKE_CURRENT_SOURCE_DIR}/../extensions/local-engine/local_engine.cc ${CMAKE_CURRENT_SOURCE_DIR}/../extensions/template_renderer.cc ${CMAKE_CURRENT_SOURCE_DIR}/utils/easywsclient.cc diff --git a/engine/cli/command_line_parser.cc b/engine/cli/command_line_parser.cc index 99f51983e..aa0b9aab4 100644 --- a/engine/cli/command_line_parser.cc +++ b/engine/cli/command_line_parser.cc @@ -33,6 +33,7 @@ #include "services/engine_service.h" #include "utils/file_manager_utils.h" #include "utils/logging_utils.h" +#include "utils/task_queue.h" namespace { constexpr const auto kCommonCommandsGroup = "Common Commands"; @@ -50,8 +51,7 @@ CommandLineParser::CommandLineParser() download_service_{std::make_shared()}, dylib_path_manager_{std::make_shared()}, db_service_{std::make_shared()}, - engine_service_{std::make_shared( - download_service_, dylib_path_manager_, db_service_)} {} + engine_service_{std::make_shared(dylib_path_manager_)} {} bool CommandLineParser::SetupCommand(int argc, char** argv) { app_.usage("Usage:\n" + commands::GetCortexBinary() + diff --git a/engine/cli/commands/cortex_upd_cmd.cc b/engine/cli/commands/cortex_upd_cmd.cc index 
e11ad4290..33a51ed53 100644 --- a/engine/cli/commands/cortex_upd_cmd.cc +++ b/engine/cli/commands/cortex_upd_cmd.cc @@ -532,10 +532,10 @@ bool CortexUpdCmd::GetLinuxInstallScript(const std::string& v, const std::string& channel) { std::vector path_list; if (channel == "nightly") { - path_list = {"menloresearch", "cortex.cpp", "dev", "engine", + path_list = {kMenloOrg, "cortex.cpp", "dev", "engine", "templates", "linux", "install.sh"}; } else { - path_list = {"menloresearch", "cortex.cpp", "main", "engine", + path_list = {kMenloOrg, "cortex.cpp", "main", "engine", "templates", "linux", "install.sh"}; } auto url_obj = url_parser::Url{ diff --git a/engine/cli/commands/cortex_upd_cmd.h b/engine/cli/commands/cortex_upd_cmd.h index 7f02839cf..fdee6cc49 100644 --- a/engine/cli/commands/cortex_upd_cmd.h +++ b/engine/cli/commands/cortex_upd_cmd.h @@ -79,9 +79,9 @@ inline std::vector GetReleasePath() { if (CORTEX_VARIANT == file_manager_utils::kNightlyVariant) { return {"cortex", "latest", "version.json"}; } else if (CORTEX_VARIANT == file_manager_utils::kBetaVariant) { - return {"repos", "menloresearch", "cortex.cpp", "releases"}; + return {"repos", kMenloOrg, "cortex.cpp", "releases"}; } else { - return {"repos", "menloresearch", "cortex.cpp", "releases", "latest"}; + return {"repos", kMenloOrg, "cortex.cpp", "releases", "latest"}; } } diff --git a/engine/cli/commands/engine_install_cmd.cc b/engine/cli/commands/engine_install_cmd.cc index bebfdb8ce..b31aecaa6 100644 --- a/engine/cli/commands/engine_install_cmd.cc +++ b/engine/cli/commands/engine_install_cmd.cc @@ -92,7 +92,10 @@ bool EngineInstallCmd::Exec(const std::string& engine, std::vector variant_selections; for (const auto& variant : variant_result.value()) { auto v_name = variant["name"].asString(); - if (string_utils::StringContainsIgnoreCase(v_name, hw_inf_.sys_inf->os) && + if ((string_utils::StringContainsIgnoreCase(v_name, + hw_inf_.sys_inf->os) || + (hw_inf_.sys_inf->os == kLinuxOs && + string_utils::StringContainsIgnoreCase(v_name, kUbuntuOs))) && string_utils::StringContainsIgnoreCase(v_name, hw_inf_.sys_inf->arch)) { variant_selections.push_back(variant["name"].asString()); diff --git a/engine/cli/commands/server_start_cmd.cc b/engine/cli/commands/server_start_cmd.cc index af2d647e2..e074ee18a 100644 --- a/engine/cli/commands/server_start_cmd.cc +++ b/engine/cli/commands/server_start_cmd.cc @@ -106,10 +106,8 @@ bool ServerStartCmd::Exec(const std::string& host, int port, #else std::vector commands; // Some engines requires to add lib search path before process being created - auto download_srv = std::make_shared(); - auto dylib_path_mng = std::make_shared(); - auto db_srv = std::make_shared(); - EngineService(download_srv, dylib_path_mng, db_srv).RegisterEngineLibPath(); + EngineService(std::make_shared()) + .RegisterEngineLibPath(); std::string p = cortex_utils::GetCurrentPath() + "/" + exe; commands.push_back(p); diff --git a/engine/cli/main.cc b/engine/cli/main.cc index a4e6c38cc..1fa45d6fd 100644 --- a/engine/cli/main.cc +++ b/engine/cli/main.cc @@ -155,7 +155,7 @@ int main(int argc, char* argv[]) { auto get_latest_version = []() -> cpp::result { try { auto res = github_release_utils::GetReleaseByVersion( - "menloresearch", "cortex.llamacpp", "latest"); + kGgmlOrg, kLlamaRepo, "latest"); if (res.has_error()) { CTL_ERR("Failed to get latest llama.cpp version: " << res.error()); return cpp::fail("Failed to get latest llama.cpp version: " + diff --git a/engine/cli/utils/download_progress.cc b/engine/cli/utils/download_progress.cc 
index 7538fff46..32cc6e20a 100644 --- a/engine/cli/utils/download_progress.cc +++ b/engine/cli/utils/download_progress.cc @@ -83,8 +83,8 @@ bool DownloadProgress::Handle( size_t max_length = 20) -> std::string { // Check the length of the input string if (str.length() >= max_length) { - return str.substr( - 0, max_length); // Return truncated string if it's too long + return str.substr(0, max_length - 3) + + ".. "; // Return truncated string if it's too long } // Calculate the number of spaces needed diff --git a/engine/config/yaml_config.cc b/engine/config/yaml_config.cc index 9650ffdcc..38128e1c4 100644 --- a/engine/config/yaml_config.cc +++ b/engine/config/yaml_config.cc @@ -48,7 +48,7 @@ void YamlHandler::ReadYamlFile(const std::string& file_path) { if (!yaml_node_["mmproj"]) { auto s = nomalize_path(file_path); auto abs_path = s.substr(0, s.find_last_of('/')) + "/mmproj.gguf"; - CTL_DBG("mmproj: " << abs_path); + CTL_TRC("mmproj: " << abs_path); auto rel_path = fmu::ToRelativeCortexDataPath(fs::path(abs_path)); if (std::filesystem::exists(abs_path)) { yaml_node_["mmproj"] = rel_path.string(); diff --git a/engine/controllers/engines.cc b/engine/controllers/engines.cc index f7deb41eb..2a9427abf 100644 --- a/engine/controllers/engines.cc +++ b/engine/controllers/engines.cc @@ -155,6 +155,7 @@ void Engines::GetEngineVariants( releases.append(json.value()); } } + CTL_INF(releases.toStyledString()); auto resp = cortex_utils::CreateCortexHttpJsonResponse(releases); resp->setStatusCode(k200OK); callback(resp); @@ -177,6 +178,8 @@ void Engines::InstallEngine( } norm_version = version; } + CTL_INF("version: " << norm_version + << ", norm_variant: " << norm_variant.value_or("")); auto result = engine_service_->InstallEngineAsync(engine, norm_version, norm_variant); diff --git a/engine/controllers/server.cc b/engine/controllers/server.cc index 079b69423..3ba4aa327 100644 --- a/engine/controllers/server.cc +++ b/engine/controllers/server.cc @@ -138,7 +138,7 @@ void server::ProcessStreamRes(std::function cb, auto err_or_done = std::make_shared(false); auto chunked_content_provider = [this, q, err_or_done, engine_type, model_id]( char* buf, - std::size_t buf_size) -> std::size_t { + std::size_t buf_size) -> std::size_t { if (buf == nullptr) { LOG_TRACE << "Buf is null"; if (!(*err_or_done)) { @@ -179,7 +179,6 @@ void server::ProcessStreamRes(std::function cb, void server::ProcessNonStreamRes(std::function cb, SyncQueue& q) { auto [status, res] = q.wait_and_pop(); - function_calling_utils::PostProcessResponse(res); LOG_DEBUG << "response: " << res.toStyledString(); auto resp = cortex_utils::CreateCortexHttpJsonResponse(res); resp->setStatusCode( diff --git a/engine/cortex-common/EngineI.h b/engine/cortex-common/EngineI.h index b796ebaed..2518b0ce5 100644 --- a/engine/cortex-common/EngineI.h +++ b/engine/cortex-common/EngineI.h @@ -47,9 +47,6 @@ class EngineI { std::shared_ptr json_body, std::function&& callback) = 0; - // For backward compatible checking - virtual bool IsSupported(const std::string& f) = 0; - // Get list of running models virtual void GetModels( std::shared_ptr jsonBody, diff --git a/engine/cortex-common/remote_enginei.h b/engine/cortex-common/remote_enginei.h index 835f526a0..163490cdc 100644 --- a/engine/cortex-common/remote_enginei.h +++ b/engine/cortex-common/remote_enginei.h @@ -1,7 +1,5 @@ #pragma once -#pragma once - #include #include diff --git a/engine/e2e-test/api/engines/test_api_engine.py b/engine/e2e-test/api/engines/test_api_engine.py index 7356ef904..842ef2c35 100644 
--- a/engine/e2e-test/api/engines/test_api_engine.py +++ b/engine/e2e-test/api/engines/test_api_engine.py @@ -28,14 +28,14 @@ def test_engines_get_llamacpp_should_be_successful(self): # engines install def test_engines_install_llamacpp_specific_version_and_variant(self): - data = {"version": "v0.1.40-b4354", "variant": "linux-amd64-avx"} + data = {"version": "b4932", "variant": "linux-avx-x64"} response = requests.post( "http://localhost:3928/v1/engines/llama-cpp/install", json=data ) assert response.status_code == 200 def test_engines_install_llamacpp_specific_version_and_null_variant(self): - data = {"version": "v0.1.40-b4354"} + data = {"version": "b4932"} response = requests.post( "http://localhost:3928/v1/engines/llama-cpp/install", json=data ) @@ -55,14 +55,14 @@ async def test_engines_install_uninstall_llamacpp_should_be_successful(self): @pytest.mark.asyncio async def test_engines_install_uninstall_llamacpp_with_only_version_should_be_failed(self): # install first - data = {"variant": "mac-arm64"} + data = {"variant": "linux-avx-x64"} install_response = requests.post( "http://127.0.0.1:3928/v1/engines/llama-cpp/install", json=data ) await wait_for_websocket_download_success_event(timeout=120) assert install_response.status_code == 200 - data = {"version": "v0.1.35"} + data = {"version": "b4932"} response = requests.delete( "http://localhost:3928/v1/engines/llama-cpp/install", json=data ) @@ -72,7 +72,7 @@ async def test_engines_install_uninstall_llamacpp_with_only_version_should_be_fa @pytest.mark.asyncio async def test_engines_install_uninstall_llamacpp_with_variant_should_be_successful(self): # install first - data = {"variant": "mac-arm64"} + data = {"variant": "linux-avx-x64"} install_response = requests.post( "http://127.0.0.1:3928/v1/engines/llama-cpp/install", json=data ) @@ -85,7 +85,7 @@ async def test_engines_install_uninstall_llamacpp_with_variant_should_be_success def test_engines_install_uninstall_llamacpp_with_specific_variant_and_version_should_be_successful( self, ): - data = {"variant": "mac-arm64", "version": "v0.1.35"} + data = {"variant": "linux-avx-x64", "version": "b4932"} # install first install_response = requests.post( "http://localhost:3928/v1/engines/llama-cpp/install", json=data diff --git a/engine/e2e-test/api/engines/test_api_engine_install_nightly.py b/engine/e2e-test/api/engines/test_api_engine_install_nightly.py index e92afb14b..088cc2474 100644 --- a/engine/e2e-test/api/engines/test_api_engine_install_nightly.py +++ b/engine/e2e-test/api/engines/test_api_engine_install_nightly.py @@ -2,7 +2,7 @@ import requests from utils.test_runner import start_server, stop_server, get_latest_pre_release_tag -latest_pre_release_tag = get_latest_pre_release_tag("menloresearch", "cortex.llamacpp") +latest_pre_release_tag = get_latest_pre_release_tag("menloresearch", "llama.cpp") class TestApiEngineInstall: @@ -23,7 +23,7 @@ def test_engines_install_llamacpp_should_be_successful(self): assert response.status_code == 200 def test_engines_install_llamacpp_specific_version_and_variant(self): - data = {"version": latest_pre_release_tag, "variant": "linux-amd64-avx"} + data = {"version": latest_pre_release_tag, "variant": "linux-avx-x64"} response = requests.post( "http://localhost:3928/v1/engines/llama-cpp/install", json=data ) diff --git a/engine/e2e-test/api/engines/test_api_get_default_engine.py b/engine/e2e-test/api/engines/test_api_get_default_engine.py index 2dfc467a3..f0566128c 100644 --- a/engine/e2e-test/api/engines/test_api_get_default_engine.py +++ 
b/engine/e2e-test/api/engines/test_api_get_default_engine.py @@ -24,8 +24,8 @@ def setup_and_teardown(self): def test_api_get_default_engine_successfully(self): # Data test engine= "llama-cpp" - name= "linux-amd64-avx" - version= "v0.1.35-27.10.24" + name= "linux-avx-x64" + version= "b4932" data = {"version": version, "variant": name} post_install_url = f"http://localhost:3928/v1/engines/{engine}/install" diff --git a/engine/e2e-test/api/engines/test_api_get_list_engine.py b/engine/e2e-test/api/engines/test_api_get_list_engine.py index e6baa22a6..38cb45b39 100644 --- a/engine/e2e-test/api/engines/test_api_get_list_engine.py +++ b/engine/e2e-test/api/engines/test_api_get_list_engine.py @@ -24,8 +24,8 @@ def setup_and_teardown(self): def test_api_get_list_engines_successfully(self): # Data test engine= "llama-cpp" - name= "linux-amd64-avx" - version= "v0.1.35-27.10.24" + name= "linux-avx-x64" + version= "b4932" post_install_url = f"http://localhost:3928/v1/engines/{engine}/install" response = requests.delete( diff --git a/engine/e2e-test/api/engines/test_api_post_default_engine.py b/engine/e2e-test/api/engines/test_api_post_default_engine.py index b2b4e4c48..cede78485 100644 --- a/engine/e2e-test/api/engines/test_api_post_default_engine.py +++ b/engine/e2e-test/api/engines/test_api_post_default_engine.py @@ -23,8 +23,8 @@ def setup_and_teardown(self): def test_api_set_default_engine_successfully(self): # Data test engine= "llama-cpp" - name= "linux-amd64-avx" - version= "v0.1.35-27.10.24" + name= "linux-avx-x64" + version= "b4932" data = {"version": version, "variant": name} post_install_url = f"http://localhost:3928/v1/engines/{engine}/install" diff --git a/engine/e2e-test/api/files/test_api_create_file.py b/engine/e2e-test/api/files/test_api_create_file.py index 7c7226f50..03525672d 100644 --- a/engine/e2e-test/api/files/test_api_create_file.py +++ b/engine/e2e-test/api/files/test_api_create_file.py @@ -23,7 +23,6 @@ def setup_and_teardown(self): # Teardown stop_server() - @pytest.mark.skipif(platform.system() != "Linux", reason="Todo: fix later on Mac and Window") def test_api_create_file_successfully(self): # Define file path file_path_rel = os.path.join("e2e-test", "api", "files", "blank.txt") diff --git a/engine/e2e-test/api/hardware/test_api_get_hardware.py b/engine/e2e-test/api/hardware/test_api_get_hardware.py index 59b15ac18..0efecdbdc 100644 --- a/engine/e2e-test/api/hardware/test_api_get_hardware.py +++ b/engine/e2e-test/api/hardware/test_api_get_hardware.py @@ -88,25 +88,6 @@ def test_api_get_hardware_successfully(self): "example": True, "description": "Indicates if the GPU is currently activated." }, - "additional_information": { - "type": "object", - "properties": { - "compute_cap": { - "type": "string", - "example": "8.6", - "description": "The compute capability of the GPU." - }, - "driver_version": { - "type": "string", - "example": "535.183", - "description": "The version of the installed driver." 
- } - }, - "required": [ - "compute_cap", - "driver_version" - ] - }, "free_vram": { "type": "integer", "example": 23983, @@ -140,7 +121,6 @@ def test_api_get_hardware_successfully(self): }, "required": [ "activated", - "additional_information", "free_vram", "id", "name", diff --git a/engine/e2e-test/api/model/test_api_model.py b/engine/e2e-test/api/model/test_api_model.py index bacf7e1b0..f370b1daa 100644 --- a/engine/e2e-test/api/model/test_api_model.py +++ b/engine/e2e-test/api/model/test_api_model.py @@ -1,6 +1,7 @@ import pytest import requests import time +import platform from utils.test_runner import ( run, start_server, @@ -95,6 +96,7 @@ async def test_models_start_stop_should_be_successful(self): time.sleep(30) print("Pull model") + requests.delete("http://localhost:3928/v1/models/tinyllama:1b") json_body = {"model": "tinyllama:1b"} response = requests.post("http://localhost:3928/v1/models/pull", json=json_body) assert response.status_code == 200, f"Failed to pull model: tinyllama:1b" @@ -110,16 +112,18 @@ async def test_models_start_stop_should_be_successful(self): response = requests.get("http://localhost:3928/v1/models") assert response.status_code == 200 - print("Start model") - json_body = {"model": "tinyllama:1b"} - response = requests.post( - "http://localhost:3928/v1/models/start", json=json_body - ) - assert response.status_code == 200, f"status_code: {response.status_code}" + # Skip tests for linux arm + if platform.machine() != "aarch64": + print("Start model") + json_body = {"model": "tinyllama:1b"} + response = requests.post( + "http://localhost:3928/v1/models/start", json=json_body + ) + assert response.status_code == 200, f"status_code: {response.status_code}" - print("Stop model") - response = requests.post("http://localhost:3928/v1/models/stop", json=json_body) - assert response.status_code == 200, f"status_code: {response.status_code}" + print("Stop model") + response = requests.post("http://localhost:3928/v1/models/stop", json=json_body) + assert response.status_code == 200, f"status_code: {response.status_code}" # update API print("Update model") diff --git a/engine/e2e-test/cli/engines/test_cli_engine_install.py b/engine/e2e-test/cli/engines/test_cli_engine_install.py index 370ebe3f3..5d520ce8b 100644 --- a/engine/e2e-test/cli/engines/test_cli_engine_install.py +++ b/engine/e2e-test/cli/engines/test_cli_engine_install.py @@ -31,25 +31,9 @@ def test_engines_install_llamacpp_should_be_successfully(self): assert len(response.json()) > 0 assert exit_code == 0, f"Install engine failed with error: {error}" - @pytest.mark.skipif(reason="Ignore onnx-runtime test") - def test_engines_install_onnx_on_macos_should_be_failed(self): - exit_code, output, error = run( - "Install Engine", ["engines", "install", "onnxruntime"] - ) - assert "is not supported on" in output, "Should display error message" - assert exit_code == 0, f"Install engine failed with error: {error}" - - @pytest.mark.skipif(reason="Ignore tensorrt-llm test") - def test_engines_install_onnx_on_tensorrt_should_be_failed(self): - exit_code, output, error = run( - "Install Engine", ["engines", "install", "tensorrt-llm"] - ) - assert "is not supported on" in output, "Should display error message" - assert exit_code == 0, f"Install engine failed with error: {error}" - @pytest.mark.skipif(platform.system() == "Windows", reason="Progress bar log issue on Windows") def test_engines_install_pre_release_llamacpp(self): - engine_version = "v0.1.43" + engine_version = "b4932" exit_code, output, error = run( "Install 
Engine", ["engines", "install", "llama-cpp", "-v", engine_version], diff --git a/engine/e2e-test/cli/engines/test_cli_engine_uninstall.py b/engine/e2e-test/cli/engines/test_cli_engine_uninstall.py index 8672110e2..3198c81a5 100644 --- a/engine/e2e-test/cli/engines/test_cli_engine_uninstall.py +++ b/engine/e2e-test/cli/engines/test_cli_engine_uninstall.py @@ -24,7 +24,10 @@ def setup_and_teardown(self): @pytest.mark.asyncio async def test_engines_uninstall_llamacpp_should_be_successfully(self): - response = requests.post("http://localhost:3928/v1/engines/llama-cpp/install") + data = {"version": "b5371"} + response = requests.post( + "http://localhost:3928/v1/engines/llama-cpp/install", json=data + ) await wait_for_websocket_download_success_event(timeout=None) exit_code, output, error = run( "Uninstall engine", ["engines", "uninstall", "llama-cpp"] diff --git a/engine/e2e-test/cli/model/test_cli_model.py b/engine/e2e-test/cli/model/test_cli_model.py index aa6e99e4a..cd80a9e2b 100644 --- a/engine/e2e-test/cli/model/test_cli_model.py +++ b/engine/e2e-test/cli/model/test_cli_model.py @@ -36,6 +36,7 @@ def setup_and_teardown(self): run("Delete model", ["models", "delete", "tinyllama:1b"]) stop_server() + @pytest.mark.skipif(platform.system() == "Windows", reason="Skip test for Windows") def test_model_pull_with_direct_url_should_be_success(self): exit_code, output, error = run( "Pull model", diff --git a/engine/e2e-test/runner/cortex-llamacpp-e2e-nightly.py b/engine/e2e-test/runner/cortex-llamacpp-e2e-nightly.py index 9fc296d60..ea3cae242 100644 --- a/engine/e2e-test/runner/cortex-llamacpp-e2e-nightly.py +++ b/engine/e2e-test/runner/cortex-llamacpp-e2e-nightly.py @@ -21,7 +21,7 @@ from api.engines.test_api_get_default_engine import TestApiDefaultEngine from api.engines.test_api_get_engine_release import TestApiEngineRelease from api.engines.test_api_get_engine_release_latest import TestApiEngineReleaseLatest -from test_api_post_default_engine import TestApiSetDefaultEngine +from api.engines.test_api_post_default_engine import TestApiSetDefaultEngine from api.model.test_api_model import TestApiModel from api.model.test_api_model_import import TestApiModelImport from api.files.test_api_create_file import TestApiCreateFile diff --git a/engine/e2e-test/runner/main.py b/engine/e2e-test/runner/main.py index 49bdc5131..8a98d0ca3 100644 --- a/engine/e2e-test/runner/main.py +++ b/engine/e2e-test/runner/main.py @@ -21,7 +21,7 @@ from api.engines.test_api_get_default_engine import TestApiDefaultEngine from api.engines.test_api_get_engine_release import TestApiEngineRelease from api.engines.test_api_get_engine_release_latest import TestApiEngineReleaseLatest -from test_api_post_default_engine import TestApiSetDefaultEngine +from api.engines.test_api_post_default_engine import TestApiSetDefaultEngine from api.model.test_api_model import TestApiModel from api.model.test_api_model_import import TestApiModelImport from api.files.test_api_create_file import TestApiCreateFile diff --git a/engine/e2e-test/test_api_cortexso_hub_llamacpp_engine.py b/engine/e2e-test/test_api_cortexso_hub_llamacpp_engine.py index 7a3c2e232..a22000d93 100644 --- a/engine/e2e-test/test_api_cortexso_hub_llamacpp_engine.py +++ b/engine/e2e-test/test_api_cortexso_hub_llamacpp_engine.py @@ -125,7 +125,7 @@ async def test_models_on_cortexso_hub(self, model_url): "Install Engine", ["engines", "install", "llama-cpp"], timeout=None, capture = False ) root = Path.home() - assert os.path.exists(root / "cortexcpp" / "engines" / "cortex.llamacpp" 
/ "version.txt") + assert os.path.exists(root / "cortexcpp" / "engines" / "llama.cpp" / "version.txt") assert exit_code == 0, f"Install engine failed with error: {error}" # Start the model diff --git a/engine/extensions/local-engine/local_engine.cc b/engine/extensions/local-engine/local_engine.cc new file mode 100644 index 000000000..74bf0d1b8 --- /dev/null +++ b/engine/extensions/local-engine/local_engine.cc @@ -0,0 +1,1087 @@ +#include "local_engine.h" +#include +#include +#include +#include +#include +#include +#include "utils/curl_utils.h" +#include "utils/json_helper.h" +#include "utils/logging_utils.h" +#include "utils/process/utils.h" +#include "utils/url_parser.h" + +namespace cortex::local { + +namespace { +const std::unordered_set kIgnoredParams = { + "model", "model_alias", "embedding", "ai_prompt", + "ai_template", "prompt_template", "mmproj", "system_prompt", + "created", "stream", "name", "os", + "owned_by", "files", "gpu_arch", "quantization_method", + "engine", "system_template", "max_tokens", "user_template", + "user_prompt", "min_keep", "mirostat", "mirostat_eta", + "mirostat_tau", "text_model", "version", "n_probs", + "object", "penalize_nl", "precision", "size", + "stop", "tfs_z", "typ_p", "caching_enabled"}; + +const std::unordered_map kParamsMap = { + {"cpu_threads", "--threads"}, + {"n_ubatch", "--ubatch-size"}, + {"n_batch", "--batch-size"}, + {"n_parallel", "--parallel"}, + {"temperature", "--temp"}, + {"top_k", "--top-k"}, + {"top_p", "--top-p"}, + {"min_p", "--min-p"}, + {"dynatemp_exponent", "--dynatemp-exp"}, + {"ctx_len", "--ctx-size"}, + {"ngl", "-ngl"}, + {"reasoning_budget", "--reasoning-budget"}, +}; + +int GenerateRandomInteger(int min, int max) { + static std::random_device rd; // Seed for the random number engine + static std::mt19937 gen(rd()); // Mersenne Twister random number engine + std::uniform_int_distribution<> dis( + min, max); // Distribution for the desired range + + return dis(gen); +} + +std::vector ConvertJsonToParamsVector(const Json::Value& root) { + std::vector res; + std::string errors; + res.push_back("--no-webui"); + + for (const auto& member : root.getMemberNames()) { + if (member == "model_path" || member == "llama_model_path") { + if (!root[member].isNull()) { + const std::string path = root[member].asString(); + res.push_back("--model"); + res.push_back(path); + + // If path contains both "Jan" and "nano", case-insensitive, add special params + std::string lowered = path; + std::transform(lowered.begin(), lowered.end(), lowered.begin(), [](unsigned char c) { + return std::tolower(c); + }); + } + continue; + } else if (kIgnoredParams.find(member) != kIgnoredParams.end()) { + continue; + } else if (kParamsMap.find(member) != kParamsMap.end()) { + res.push_back(kParamsMap.at(member)); + res.push_back(root[member].asString()); + continue; + } else if (member == "model_type") { + if (root[member].asString() == "embedding") { + res.push_back("--embedding"); + } + continue; + } else if (member == "cache_type") { + if (!root[member].isNull()) { + res.push_back("-ctk"); + res.push_back(root[member].asString()); + res.push_back("-ctv"); + res.push_back(root[member].asString()); + } + continue; + } else if (member == "use_mmap") { + if (!root[member].asBool()) { + res.push_back("--no-mmap"); + } + continue; + } else if (member == "ignore_eos") { + if (root[member].asBool()) { + res.push_back("--ignore_eos"); + } + continue; + } else if (member == "ctx_len") { + if (!root[member].isNull()) { + res.push_back("--ctx-size"); + 
res.push_back(root[member].asString()); + } + continue; + } + + // Generic handling for other members + res.push_back("--" + member); + if (root[member].isString()) { + res.push_back(root[member].asString()); + } else if (root[member].isInt()) { + res.push_back(std::to_string(root[member].asInt())); + } else if (root[member].isDouble()) { + res.push_back(std::to_string(root[member].asDouble())); + } else if (root[member].isArray()) { + std::stringstream ss; + ss << "["; + bool first = true; + for (const auto& value : root[member]) { + if (!first) { + ss << ", "; + } + ss << "\"" << value.asString() << "\""; + first = false; + } + ss << "]"; + res.push_back(ss.str()); + } + } + + return res; +} + + +constexpr const auto kMinDataChunkSize = 6u; + +struct OaiInfo { + std::string model; + bool include_usage = false; + bool oai_endpoint = false; + int n_probs = 0; +}; + +struct StreamingCallback { + std::shared_ptr callback; + bool need_stop = true; + OaiInfo oi; +}; + +struct Usage { + int prompt_tokens = 0; + int completion_tokens = 0; +}; + +std::string GenerateRandomString(std::size_t length) { + const std::string characters = + "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; + + std::random_device rd; + std::mt19937 generator(rd()); + + std::uniform_int_distribution<> distribution( + 0, static_cast(characters.size()) - 1); + + std::string random_string(length, '\0'); + std::generate_n(random_string.begin(), length, + [&]() { return characters[distribution(generator)]; }); + + return random_string; +} + +std::vector GetUTF8Bytes(const std::string& str) { + std::vector bytes; + for (unsigned char c : str) { + bytes.push_back(static_cast(c)); + } + return bytes; +} + +Json::Value TransformLogProbs(const Json::Value& logprobs) { + Json::Value root; + Json::Value logprobs_json(Json::arrayValue); + + // Iterate through each token group in the input + for (const auto& token_group : logprobs) { + Json::Value content_item; + + // Set the token (content) + content_item["token"] = token_group["content"].asString(); + + // Get the probabilities array + const auto& probs = token_group["probs"]; + + // Set the main token's logprob (first probability) + if (!probs.empty()) { + content_item["logprob"] = std::log( + probs[0]["prob"].asDouble() + std::numeric_limits::epsilon()); + } + + // Get UTF-8 bytes for the token + auto bytes = GetUTF8Bytes(token_group["content"].asString()); + Json::Value bytes_array(Json::arrayValue); + for (int byte : bytes) { + bytes_array.append(byte); + } + content_item["bytes"] = bytes_array; + + // Create top_logprobs array + Json::Value top_logprobs(Json::arrayValue); + for (const auto& prob_item : probs) { + Json::Value logprob_item; + logprob_item["token"] = prob_item["tok_str"].asString(); + logprob_item["logprob"] = + std::log(prob_item["prob"].asDouble() + + std::numeric_limits::epsilon()); + + // Get UTF-8 bytes for this alternative token + auto alt_bytes = GetUTF8Bytes(prob_item["tok_str"].asString()); + Json::Value alt_bytes_array(Json::arrayValue); + for (int byte : alt_bytes) { + alt_bytes_array.append(byte); + } + logprob_item["bytes"] = alt_bytes_array; + + top_logprobs.append(logprob_item); + } + content_item["top_logprobs"] = top_logprobs; + + logprobs_json.append(content_item); + } + root["content"] = logprobs_json; + return root; +} + +std::string CreateReturnJson( + const std::string& id, const std::string& model, const std::string& content, + Json::Value finish_reason, bool include_usage, + std::optional usage = std::nullopt, + 
std::optional logprobs = std::nullopt) { + Json::Value root; + + root["id"] = id; + root["model"] = model; + root["created"] = static_cast(std::time(nullptr)); + root["object"] = "chat.completion.chunk"; + + Json::Value choicesArray(Json::arrayValue); + // If usage, the choices field will always be an empty array + if (!usage) { + Json::Value choice; + + choice["index"] = 0; + Json::Value delta; + delta["content"] = content; + delta["role"] = "assistant"; + choice["delta"] = delta; + choice["finish_reason"] = finish_reason; + if (logprobs.has_value() && !logprobs.value().empty()) { + choice["logprobs"] = TransformLogProbs(logprobs.value()); + } + + choicesArray.append(choice); + } + root["choices"] = choicesArray; + if (include_usage) { + if (usage) { + Json::Value usage_json; + Json::Value details; + details["reasoning_tokens"] = 0; + usage_json["prompt_tokens"] = (*usage).prompt_tokens; + usage_json["completion_tokens"] = (*usage).completion_tokens; + usage_json["total_tokens"] = + (*usage).prompt_tokens + (*usage).completion_tokens; + usage_json["completion_tokens_details"] = details; + root["usage"] = usage_json; + } else { + root["usage"] = Json::Value(); + } + } + + Json::StreamWriterBuilder writer; + writer["indentation"] = ""; // This sets the indentation to an empty string, + // producing compact output. + return Json::writeString(writer, root); +} + +size_t WriteCallback(char* ptr, size_t size, size_t nmemb, void* userdata) { + auto* sc = static_cast(userdata); + size_t data_length = size * nmemb; + + if (ptr && data_length > kMinDataChunkSize) { + std::string chunk(ptr + kMinDataChunkSize, data_length - kMinDataChunkSize); + CTL_DBG(chunk); + if (sc->oi.oai_endpoint) { + if (chunk.find("[DONE]") != std::string::npos) { + Json::Value status; + status["is_done"] = true; + status["has_error"] = false; + status["is_stream"] = true; + status["status_code"] = 200; + Json::Value chunk_json; + chunk_json["data"] = "data: [DONE]"; + sc->need_stop = false; + (*sc->callback)(std::move(status), std::move(chunk_json)); + return data_length; + } + if (!sc->oi.include_usage && + chunk.find("completion_tokens") != std::string::npos) { + return data_length; + } + + Json::Value chunk_json; + chunk_json["data"] = "data: " + chunk; + Json::Value status; + status["is_done"] = false; + status["has_error"] = false; + status["is_stream"] = true; + status["status_code"] = 200; + (*sc->callback)(std::move(status), std::move(chunk_json)); + } else { + if (chunk.find("[DONE]") != std::string::npos) { + Json::Value status; + status["is_done"] = true; + status["has_error"] = false; + status["is_stream"] = true; + status["status_code"] = 200; + Json::Value chunk_json; + chunk_json["data"] = "data: [DONE]"; + sc->need_stop = false; + (*sc->callback)(std::move(status), std::move(chunk_json)); + return data_length; + } + auto json_data = json_helper::ParseJsonString(chunk); + // DONE + if (!json_data.isNull() && json_data.isMember("timings")) { + std::optional u; + if (sc->oi.include_usage) { + u = Usage{json_data["tokens_evaluated"].asInt(), + json_data["tokens_predicted"].asInt()}; + } + + Json::Value chunk_json; + chunk_json["data"] = + "data: " + CreateReturnJson(GenerateRandomString(20), sc->oi.model, + "", "stop", sc->oi.include_usage, u); + Json::Value status; + status["is_done"] = false; + status["has_error"] = false; + status["is_stream"] = true; + status["status_code"] = 200; + (*sc->callback)(std::move(status), std::move(chunk_json)); + + sc->need_stop = false; + return data_length; + } + + 
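        // Each server-sent event from llama-server reaches this callback as a
        // chunk of the form "data: {...}"; kMinDataChunkSize (6) is the length
        // of that "data: " prefix, which is skipped before the JSON payload is
        // parsed.  Chunks that are not the final "timings" message are
        // re-wrapped below into OpenAI-style "chat.completion.chunk" objects
        // via CreateReturnJson before being forwarded to the HTTP callback.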
Json::Value logprobs; + if (sc->oi.n_probs > 0) { + logprobs = json_data["completion_probabilities"]; + } + std::string to_send; + if (json_data.isMember("choices") && json_data["choices"].isArray() && + json_data["choices"].size() > 0) { + to_send = json_data["choices"][0].get("text", "").asString(); + } + CTL_DBG(to_send); + const std::string str = + CreateReturnJson(GenerateRandomString(20), sc->oi.model, to_send, "", + sc->oi.include_usage, std::nullopt, logprobs); + Json::Value chunk_json; + chunk_json["data"] = "data: " + str; + Json::Value status; + status["is_done"] = false; + status["has_error"] = false; + status["is_stream"] = true; + status["status_code"] = 200; + (*sc->callback)(std::move(status), std::move(chunk_json)); + return data_length; + } + } + + return data_length; +} + +Json::Value ConvertLogitBiasToArray(const Json::Value& input) { + Json::Value result(Json::arrayValue); + if (input.isObject()) { + const auto& member_names = input.getMemberNames(); + for (const auto& tokenStr : member_names) { + Json::Value pair(Json::arrayValue); + pair.append(std::stoi(tokenStr)); + pair.append(input[tokenStr].asFloat()); + result.append(pair); + } + } + return result; +} + +Json::Value CreateFullReturnJson( + const std::string& id, const std::string& model, const std::string& content, + const std::string& system_fingerprint, int prompt_tokens, + int completion_tokens, Json::Value finish_reason = Json::Value(), + std::optional logprobs = std::nullopt) { + Json::Value root; + + root["id"] = id; + root["model"] = model; + root["created"] = static_cast(std::time(nullptr)); + root["object"] = "chat.completion"; + root["system_fingerprint"] = system_fingerprint; + + Json::Value choicesArray(Json::arrayValue); + Json::Value choice; + + choice["index"] = 0; + Json::Value message; + message["role"] = "assistant"; + message["content"] = content; + choice["message"] = message; + choice["finish_reason"] = finish_reason; + if (logprobs.has_value() && !logprobs.value().empty()) { + choice["logprobs"] = TransformLogProbs(logprobs.value()); + } + + choicesArray.append(choice); + root["choices"] = choicesArray; + + Json::Value usage; + usage["prompt_tokens"] = prompt_tokens; + usage["completion_tokens"] = completion_tokens; + usage["total_tokens"] = prompt_tokens + completion_tokens; + root["usage"] = usage; + + return root; +} + +} // namespace + +LocalEngine::~LocalEngine() { + for (auto& [_, si] : server_map_) { + (void)cortex::process::KillProcess(si.process_info); + } + server_map_.clear(); +} +void LocalEngine::HandleChatCompletion(std::shared_ptr json_body, + http_callback&& callback) { + auto model_id = json_body->get("model", "").asString(); + if (model_id.empty()) { + CTL_WRN("Model is empty"); + } + if (server_map_.find(model_id) != server_map_.end()) { + auto& s = server_map_[model_id]; + auto oaicompat = [&json_body]() -> bool { + if (json_body->isMember("logprobs") && + (*json_body)["logprobs"].asBool()) { + return false; + } + return true; + }(); + if (oaicompat) { + HandleOpenAiChatCompletion( + json_body, const_cast(callback), model_id); + } else { + HandleNonOpenAiChatCompletion( + json_body, const_cast(callback), model_id); + } + } else { + Json::Value error; + error["error"] = "Model is not loaded yet: " + model_id; + Json::Value status; + status["is_done"] = true; + status["has_error"] = true; + status["is_stream"] = false; + status["status_code"] = 400; + callback(std::move(status), std::move(error)); + } +} + +void LocalEngine::HandleEmbedding(std::shared_ptr json_body, + 
http_callback&& callback) { + auto model_id = json_body->get("model", "").asString(); + if (model_id.empty()) { + CTL_WRN("Model is empty"); + } + if (server_map_.find(model_id) != server_map_.end()) { + auto& s = server_map_[model_id]; + auto url = url_parser::Url{ + /*.protocol*/ "http", + /*.host*/ s.host + ":" + std::to_string(s.port), + /*.pathParams*/ {"v1", "embeddings"}, + /* .queries = */ {}, + }; + + auto response = curl_utils::SimplePostJson(url.ToFullPath(), + json_body->toStyledString()); + + if (response.has_error()) { + CTL_WRN("Error: " << response.error()); + Json::Value error; + error["error"] = response.error(); + Json::Value status; + status["is_done"] = true; + status["has_error"] = true; + status["is_stream"] = false; + status["status_code"] = 400; + callback(std::move(status), std::move(error)); + } else { + Json::Value status; + status["is_done"] = true; + status["has_error"] = false; + status["is_stream"] = false; + status["status_code"] = 200; + callback(std::move(status), std::move(response.value())); + } + } else { + Json::Value error; + error["error"] = "Model is not loaded yet: " + model_id; + Json::Value status; + status["is_done"] = true; + status["has_error"] = true; + status["is_stream"] = false; + status["status_code"] = 400; + callback(std::move(status), std::move(error)); + } +} + +void LocalEngine::LoadModel(std::shared_ptr json_body, + http_callback&& callback) { + auto model_id = json_body->get("model", "").asString(); + if (model_id.empty()) { + CTL_WRN("Model is empty"); + } + if (server_map_.find(model_id) != server_map_.end()) { + CTL_INF("Model " << model_id << " is already loaded"); + Json::Value error; + error["error"] = "Model " + model_id + " is already loaded"; + Json::Value status; + status["is_done"] = true; + status["has_error"] = true; + status["is_stream"] = false; + status["status_code"] = 409; + callback(std::move(status), std::move(error)); + return; + } + + CTL_INF("Start loading model"); + auto wait_for_server_up = [this](const std::string& model, + const std::string& host, int port) { + auto url = url_parser::Url{ + /*.protocol*/ "http", + /*.host*/ host + ":" + std::to_string(port), + /*.pathParams*/ {"health"}, + /*.queries*/ {}, + }; + while (server_map_.find(model) != server_map_.end()) { + auto res = curl_utils::SimpleGet(url.ToFullPath()); + if (res.has_error()) { + LOG_INFO << "Wait for server up .."; + std::this_thread::sleep_for(std::chrono::seconds(1)); + } else { + return true; + } + } + return false; + }; + + LOG_DEBUG << "Start to spawn llama-server"; + + server_map_[model_id].host = "127.0.0.1"; + server_map_[model_id].port = GenerateRandomInteger(39400, 39999); + auto& s = server_map_[model_id]; + s.pre_prompt = json_body->get("pre_prompt", "").asString(); + s.user_prompt = json_body->get("user_prompt", "USER: ").asString(); + s.ai_prompt = json_body->get("ai_prompt", "ASSISTANT: ").asString(); + s.system_prompt = + json_body->get("system_prompt", "ASSISTANT's RULE: ").asString(); + std::vector params = ConvertJsonToParamsVector(*json_body); + params.push_back("--host"); + params.push_back(s.host); + params.push_back("--port"); + params.push_back(std::to_string(s.port)); + + + params.push_back("--jinja"); + + std::vector v; + v.reserve(params.size() + 1); + auto engine_dir = engine_service_.GetEngineDirPath(kLlamaRepo); + if (engine_dir.has_error()) { + CTL_WRN(engine_dir.error()); + server_map_.erase(model_id); + return; + } + auto exe = (engine_dir.value().first / kLlamaServer).string(); + + v.push_back(exe); + 
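  // For illustration only (hypothetical model path and port): the child
  // process spawned below ends up looking roughly like
  //   <engine_dir>/llama-server --no-webui --model /models/tiny.gguf \
  //     --host 127.0.0.1 --port 39517 --jinja
  // with the port drawn at random from [39400, 39999] above, and the child's
  // stdout/stderr redirected to cortex.log.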
v.insert(v.end(), params.begin(), params.end()); + engine_service_.RegisterEngineLibPath(); + + auto log_path = + (file_manager_utils::GetCortexLogPath() / "logs" / "cortex.log").string(); + CTL_DBG("log: " << log_path); + auto result = cortex::process::SpawnProcess(v, log_path, log_path); + if (result.has_error()) { + CTL_ERR("Fail to spawn process. " << result.error()); + Json::Value error; + error["error"] = "Fail to spawn process"; + Json::Value status; + status["is_done"] = true; + status["has_error"] = true; + status["is_stream"] = false; + status["status_code"] = 500; + callback(std::move(status), std::move(error)); + server_map_.erase(model_id); + return; + } + + s.process_info = result.value(); + if (wait_for_server_up(model_id, s.host, s.port)) { + s.start_time = std::chrono::system_clock::now().time_since_epoch() / + std::chrono::milliseconds(1); + Json::Value response; + response["status"] = "Model loaded successfully with pid: " + + std::to_string(s.process_info.pid); + Json::Value status; + status["is_done"] = true; + status["has_error"] = false; + status["is_stream"] = false; + status["status_code"] = 200; + callback(std::move(status), std::move(response)); + } else { + server_map_.erase(model_id); + Json::Value error; + error["error"] = "Wait for server up timeout"; + Json::Value status; + status["is_done"] = true; + status["has_error"] = true; + status["is_stream"] = false; + status["status_code"] = 500; + callback(std::move(status), std::move(error)); + } +} + +void LocalEngine::UnloadModel(std::shared_ptr json_body, + http_callback&& callback) { + auto model_id = json_body->get("model", "").asString(); + if (model_id.empty()) { + CTL_WRN("Model is empty"); + } + + if (server_map_.find(model_id) != server_map_.end()) { + auto& s = server_map_[model_id]; +#if defined(_WIN32) || defined(_WIN64) + auto sent = cortex::process::KillProcess(s.process_info); +#else + auto sent = (kill(s.process_info.pid, SIGTERM) != -1); +#endif + if (sent) { + LOG_INFO << "SIGINT signal sent to child process"; + Json::Value response; + response["status"] = "Model unloaded successfully with pid: " + + std::to_string(s.process_info.pid); + Json::Value status; + status["is_done"] = true; + status["has_error"] = false; + status["is_stream"] = false; + status["status_code"] = 200; + callback(std::move(status), std::move(response)); + server_map_.erase(model_id); + } else { + LOG_ERROR << "Failed to send SIGINT signal to child process"; + Json::Value error; + error["error"] = "Failed to unload model: " + model_id; + Json::Value status; + status["is_done"] = true; + status["has_error"] = true; + status["is_stream"] = false; + status["status_code"] = 500; + callback(std::move(status), std::move(error)); + } + } else { + Json::Value error; + error["error"] = "Model is not loaded yet: " + model_id; + Json::Value status; + status["is_done"] = true; + status["has_error"] = true; + status["is_stream"] = false; + status["status_code"] = 400; + callback(std::move(status), std::move(error)); + } +} + +void LocalEngine::GetModelStatus(std::shared_ptr json_body, + http_callback&& callback) { + auto model_id = json_body->get("model", "").asString(); + if (model_id.empty()) { + CTL_WRN("Model is empty"); + } + if (server_map_.find(model_id) != server_map_.end()) { + Json::Value response; + response["status"] = "Model is loaded"; + Json::Value status; + status["is_done"] = true; + status["has_error"] = false; + status["is_stream"] = false; + status["status_code"] = 200; + callback(std::move(status), 
std::move(response)); + } else { + Json::Value error; + error["error"] = "Model is not loaded yet: " + model_id; + Json::Value status; + status["is_done"] = true; + status["has_error"] = true; + status["is_stream"] = false; + status["status_code"] = 400; + callback(std::move(status), std::move(error)); + } +} + +void LocalEngine::GetModels(std::shared_ptr json_body, + http_callback&& callback) { + Json::Value json_resp; + Json::Value model_array(Json::arrayValue); + { + for (const auto& [m, s] : server_map_) { + Json::Value val; + val["id"] = m; + val["engine"] = kLlamaEngine; + val["start_time"] = s.start_time; + val["model_size"] = 0u; + val["vram"] = 0u; + val["ram"] = 0u; + val["object"] = "model"; + model_array.append(val); + } + } + + json_resp["object"] = "list"; + json_resp["data"] = model_array; + + Json::Value status; + status["is_done"] = true; + status["has_error"] = false; + status["is_stream"] = false; + status["status_code"] = 200; + callback(std::move(status), std::move(json_resp)); + CTL_INF("Running models responded"); + (void)json_body; +} + +void LocalEngine::HandleOpenAiChatCompletion( + std::shared_ptr json_body, http_callback&& callback, + const std::string& model) { + CTL_DBG("Hanle OpenAI chat completion"); + auto is_stream = (*json_body).get("stream", false).asBool(); + auto include_usage = [&json_body, is_stream]() -> bool { + if (is_stream) { + if (json_body->isMember("stream_options") && + !(*json_body)["stream_options"].isNull()) { + return (*json_body)["stream_options"] + .get("include_usage", false) + .asBool(); + } + return false; + } + return false; + }(); + + auto n = [&json_body, is_stream]() -> int { + if (is_stream) + return 1; + return (*json_body).get("n", 1).asInt(); + }(); + + auto& s = server_map_.at(model); + // Format logit_bias + if (json_body->isMember("logit_bias")) { + auto logit_bias = ConvertLogitBiasToArray((*json_body)["logit_bias"]); + (*json_body)["logit_bias"] = logit_bias; + } + // llama.cpp server only supports n = 1 + (*json_body)["n"] = 1; + + auto url = url_parser::Url{ + /*.protocol*/ "http", + /*.host*/ s.host + ":" + std::to_string(s.port), + /*.pathParams*/ {"v1", "chat", "completions"}, + /*.queries*/ {}, + }; + + if (is_stream) { + q_.RunInQueue([s, json_body, callback, model, url = std::move(url)] { + auto curl = curl_easy_init(); + if (!curl) { + CTL_WRN("Failed to initialize CURL"); + return; + } + + curl_easy_setopt(curl, CURLOPT_URL, url.ToFullPath().c_str()); + curl_easy_setopt(curl, CURLOPT_POST, 1L); + CTL_INF(url.ToFullPath()); + + struct curl_slist* headers = nullptr; + headers = curl_slist_append(headers, "Content-Type: application/json"); + curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers); + auto json_str = json_body->toStyledString(); + curl_easy_setopt(curl, CURLOPT_POSTFIELDS, json_str.c_str()); + curl_easy_setopt(curl, CURLOPT_POSTFIELDSIZE, json_str.length()); + curl_easy_setopt(curl, CURLOPT_TCP_KEEPALIVE, 1L); + + StreamingCallback sc; + OaiInfo oi{model, false /*include_usage*/, true /*oai_endpoint*/, + 0 /*n_probs*/}; + sc.callback = std::make_shared(callback); + sc.need_stop = true; + sc.oi = oi; + + curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteCallback); + curl_easy_setopt(curl, CURLOPT_WRITEDATA, &sc); + auto res = curl_easy_perform(curl); + + if (res != CURLE_OK) { + CTL_WRN("CURL request failed: " << curl_easy_strerror(res)); + + Json::Value status; + status["is_done"] = true; + status["has_error"] = true; + status["is_stream"] = true; + status["status_code"] = 500; + + Json::Value error; 
+ error["error"] = curl_easy_strerror(res); + callback(std::move(status), std::move(error)); + } + curl_easy_cleanup(curl); + if (sc.need_stop) { + CTL_DBG("No stop message received, need to stop"); + Json::Value status; + status["is_done"] = true; + status["has_error"] = false; + status["is_stream"] = true; + status["status_code"] = 200; + (*sc.callback)(std::move(status), Json::Value()); + } + }); + + } else { + Json::Value result; + // multiple choices + for (int i = 0; i < n; i++) { + auto response = curl_utils::SimplePostJson(url.ToFullPath(), + json_body->toStyledString()); + + if (response.has_value()) { + auto r = response.value(); + if (i == 0) { + result = r; + } else { + r["choices"][0]["index"] = i; + result["choices"].append(r["choices"][0]); + result["usage"]["completion_tokens"] = + result["usage"]["completion_tokens"].asInt() + + r["usage"]["completion_tokens"].asInt(); + result["usage"]["prompt_tokens"] = + result["usage"]["prompt_tokens"].asInt() + + r["usage"]["prompt_tokens"].asInt(); + result["usage"]["total_tokens"] = + result["usage"]["total_tokens"].asInt() + + r["usage"]["total_tokens"].asInt(); + } + + if (i == n - 1) { + Json::Value status; + status["is_done"] = true; + status["has_error"] = false; + status["is_stream"] = false; + status["status_code"] = 200; + callback(std::move(status), std::move(result)); + } + } else { + CTL_WRN("Error: " << response.error()); + Json::Value status; + status["is_done"] = true; + status["has_error"] = true; + status["is_stream"] = false; + status["status_code"] = 500; + callback(std::move(status), std::move(response.value())); + break; + } + } + } +} + +// (sang) duplicate code but it is easier to clean when +// llama-server upstream is fully OpenAI API Compatible +void LocalEngine::HandleNonOpenAiChatCompletion( + std::shared_ptr json_body, http_callback&& callback, + const std::string& model) { + CTL_DBG("Hanle NonOpenAI chat completion"); + auto is_stream = (*json_body).get("stream", false).asBool(); + auto include_usage = [&json_body, is_stream]() -> bool { + if (is_stream) { + if (json_body->isMember("stream_options") && + !(*json_body)["stream_options"].isNull()) { + return (*json_body)["stream_options"] + .get("include_usage", false) + .asBool(); + } + return false; + } + return false; + }(); + + auto n = [&json_body, is_stream]() -> int { + if (is_stream) + return 1; + return (*json_body).get("n", 1).asInt(); + }(); + + auto& s = server_map_.at(model); + + // Format logit_bias + if (json_body->isMember("logit_bias")) { + auto logit_bias = ConvertLogitBiasToArray((*json_body)["logit_bias"]); + (*json_body)["logit_bias"] = logit_bias; + } + auto get_message = [](const Json::Value& msg_content) -> std::string { + if (msg_content.isArray()) { + for (const auto& mc : msg_content) { + if (mc["type"].asString() == "text") { + return mc["text"].asString(); + } + } + } else { + return msg_content.asString(); + } + return ""; + }; + + if (!json_body->isMember("prompt") || + (*json_body)["prompt"].asString().empty()) { + auto formatted_output = s.pre_prompt; + for (const auto& message : (*json_body)["messages"]) { + auto input_role = message["role"].asString(); + std::string role; + if (input_role == "user") { + role = s.user_prompt; + } else if (input_role == "assistant") { + role = s.ai_prompt; + } else if (input_role == "system") { + role = s.system_prompt; + } else { + role = input_role; + } + + if (auto content = get_message(message["content"]); !content.empty()) { + formatted_output += role + content; + } + } + 
formatted_output += s.ai_prompt; + (*json_body)["prompt"] = formatted_output; + } + + (*json_body)["n"] = 1; + int n_probs = json_body->get("n_probs", 0).asInt(); + + auto url = url_parser::Url{ + /*.protocol*/ "http", + /*.host*/ s.host + ":" + std::to_string(s.port), + /*.pathParams*/ {"v1", "completions"}, + /*.queries*/ {}, + }; + + if (is_stream) { + q_.RunInQueue([s, json_body, callback, n_probs, model, + url = std::move(url)] { + auto curl = curl_easy_init(); + if (!curl) { + CTL_WRN("Failed to initialize CURL"); + return; + } + + curl_easy_setopt(curl, CURLOPT_URL, url.ToFullPath().c_str()); + curl_easy_setopt(curl, CURLOPT_POST, 1L); + + struct curl_slist* headers = nullptr; + headers = curl_slist_append(headers, "Content-Type: application/json"); + curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers); + auto json_str = json_body->toStyledString(); + curl_easy_setopt(curl, CURLOPT_POSTFIELDS, json_str.c_str()); + curl_easy_setopt(curl, CURLOPT_POSTFIELDSIZE, json_str.length()); + curl_easy_setopt(curl, CURLOPT_TCP_KEEPALIVE, 1L); + + StreamingCallback sc; + OaiInfo oi{model, false /*include_usage*/, false /*oai_endpoint*/, + n_probs}; + sc.callback = std::make_shared(callback); + sc.need_stop = true; + sc.oi = oi; + + curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteCallback); + curl_easy_setopt(curl, CURLOPT_WRITEDATA, &sc); + auto res = curl_easy_perform(curl); + + if (res != CURLE_OK) { + CTL_WRN("CURL request failed: " << curl_easy_strerror(res)); + + Json::Value status; + status["is_done"] = true; + status["has_error"] = true; + status["is_stream"] = true; + status["status_code"] = 500; + + Json::Value error; + error["error"] = curl_easy_strerror(res); + callback(std::move(status), std::move(error)); + } + curl_easy_cleanup(curl); + if (sc.need_stop) { + CTL_DBG("No stop message received, need to stop"); + Json::Value status; + status["is_done"] = true; + status["has_error"] = false; + status["is_stream"] = true; + status["status_code"] = 200; + (*sc.callback)(std::move(status), Json::Value()); + } + }); + + } else { + + Json::Value result; + int prompt_tokens = 0; + int predicted_tokens = 0; + // multiple choices + for (int i = 0; i < n; i++) { + auto response = curl_utils::SimplePostJson(url.ToFullPath(), + json_body->toStyledString()); + if (response.has_value()) { + auto r = response.value(); + Json::Value logprobs; + prompt_tokens += r["tokens_evaluated"].asInt(); + predicted_tokens += r["tokens_predicted"].asInt(); + std::string to_send = r["content"].asString(); + string_utils::LTrim(to_send); + if (n_probs > 0) { + logprobs = r["completion_probabilities"]; + } + if (i == 0) { + result = CreateFullReturnJson( + GenerateRandomString(20), model, to_send, "_", prompt_tokens, + predicted_tokens, Json::Value("stop"), logprobs); + } else { + auto choice = CreateFullReturnJson( + GenerateRandomString(20), model, to_send, "_", prompt_tokens, + predicted_tokens, Json::Value("stop"), logprobs)["choices"][0]; + choice["index"] = i; + result["choices"].append(choice); + result["usage"]["completion_tokens"] = predicted_tokens; + result["usage"]["prompt_tokens"] = prompt_tokens; + result["usage"]["total_tokens"] = predicted_tokens + prompt_tokens; + } + + if (i == n - 1) { + Json::Value status; + status["is_done"] = true; + status["has_error"] = false; + status["is_stream"] = false; + status["status_code"] = 200; + callback(std::move(status), std::move(result)); + } + } else { + CTL_WRN("Error: " << response.error()); + Json::Value status; + status["is_done"] = true; + 
status["has_error"] = true; + status["is_stream"] = false; + status["status_code"] = 500; + callback(std::move(status), std::move(response.value())); + break; + } + } + } +} + +} // namespace cortex::local diff --git a/engine/extensions/local-engine/local_engine.h b/engine/extensions/local-engine/local_engine.h new file mode 100644 index 000000000..6dd970799 --- /dev/null +++ b/engine/extensions/local-engine/local_engine.h @@ -0,0 +1,75 @@ +#pragma once + +#include +#include +#include +#include +#include "cortex-common/EngineI.h" +#include "json/json.h" +#include "services/engine_service.h" +#include "utils/process/utils.h" +#include "utils/task_queue.h" + +namespace cortex::local { +using http_callback = std::function; + +struct ServerAddress { + std::string host; + int port; + cortex::process::ProcessInfo process_info; + std::string pre_prompt; + std::string user_prompt; + std::string ai_prompt; + std::string system_prompt; + uint64_t start_time; +}; + +class LocalEngine : public EngineI { + public: + LocalEngine(EngineService& engine_service, TaskQueue& q) + : engine_service_(engine_service), q_(q) {} + ~LocalEngine(); + + void Load(EngineLoadOption opts) final {} + + void Unload(EngineUnloadOption opts) final {} + + void HandleChatCompletion(std::shared_ptr json_body, + http_callback&& callback) final; + void HandleEmbedding(std::shared_ptr json_body, + http_callback&& callback) final; + void LoadModel(std::shared_ptr json_body, + http_callback&& callback) final; + void UnloadModel(std::shared_ptr json_body, + http_callback&& callback) final; + void GetModelStatus(std::shared_ptr json_body, + http_callback&& callback) final; + + // Get list of running models + void GetModels(std::shared_ptr jsonBody, + http_callback&& callback) final; + + bool SetFileLogger(int max_log_lines, const std::string& log_path) final { + return true; + } + void SetLogLevel(trantor::Logger::LogLevel logLevel) final {} + + // Stop inflight chat completion in stream mode + void StopInferencing(const std::string& model_id) final {} + + private: + void HandleOpenAiChatCompletion(std::shared_ptr json_body, + http_callback&& callback, + const std::string& model); + + void HandleNonOpenAiChatCompletion(std::shared_ptr json_body, + http_callback&& callback, + const std::string& model); + + private: + std::unordered_map server_map_; + EngineService& engine_service_; + TaskQueue& q_; +}; + +} // namespace cortex::local diff --git a/engine/main.cc b/engine/main.cc index ab4e74857..abde0441b 100644 --- a/engine/main.cc +++ b/engine/main.cc @@ -196,15 +196,16 @@ void RunServer(bool ignore_cout) { auto config_service = std::make_shared(); auto download_service = std::make_shared(event_queue_ptr, config_service); + auto task_queue = std::make_shared( + std::min(2u, std::thread::hardware_concurrency()), "background_task"); auto engine_service = std::make_shared( - download_service, dylib_path_manager, db_service); + download_service, dylib_path_manager, db_service, task_queue); auto inference_svc = std::make_shared(engine_service); auto model_src_svc = std::make_shared(db_service); - cortex::TaskQueue task_queue( - std::min(2u, std::thread::hardware_concurrency()), "background_task"); - auto model_service = - std::make_shared(db_service, hw_service, download_service, - inference_svc, engine_service, task_queue); + + auto model_service = std::make_shared( + db_service, hw_service, download_service, inference_svc, engine_service, + *task_queue); inference_svc->SetModelService(model_service); auto file_watcher_srv = 
std::make_shared( diff --git a/engine/repositories/file_fs_repository.cc b/engine/repositories/file_fs_repository.cc index f5b349f45..67c0981ba 100644 --- a/engine/repositories/file_fs_repository.cc +++ b/engine/repositories/file_fs_repository.cc @@ -18,14 +18,10 @@ std::filesystem::path SanitizePath(const std::filesystem::path& user_input, std::filesystem::path resolved_path = std::filesystem::weakly_canonical( std::filesystem::path(basedir) / std::filesystem::path(user_input)); /* Ensure the resolved path is within our basedir */ - for (auto p = resolved_path; !p.empty(); p = p.parent_path()) { - if (std::filesystem::equivalent(p, abs_base)) { - return resolved_path; - } - if (p == p.parent_path()) { // reached the root directory - break; - } + if (resolved_path.string().find(abs_base.string()) != std::string::npos) { + return resolved_path; } + return {}; } diff --git a/engine/services/engine_service.cc b/engine/services/engine_service.cc index 48cc6ff37..15c7148c7 100644 --- a/engine/services/engine_service.cc +++ b/engine/services/engine_service.cc @@ -9,6 +9,7 @@ #include "config/model_config.h" #include "database/engines.h" #include "database/models.h" +#include "extensions/local-engine/local_engine.h" #include "extensions/remote-engine/remote_engine.h" #include "utils/archive_utils.h" @@ -16,6 +17,7 @@ #include "utils/engine_matcher_utils.h" #include "utils/file_manager_utils.h" #include "utils/github_release_utils.h" +#include "utils/hardware/os_info.h" #include "utils/logging_utils.h" #include "utils/normalize_engine.h" #include "utils/result.hpp" @@ -46,13 +48,6 @@ std::string Repo2Engine(const std::string& r) { } return r; }; - -std::string GetEnginePath(std::string_view e) { - if (e == kLlamaRepo) { - return kLlamaLibPath; - } - return kLlamaLibPath; -}; } // namespace cpp::result EngineService::InstallEngineAsync( @@ -236,11 +231,14 @@ cpp::result EngineService::DownloadEngine( auto latest_version_semantic = normalized_version == "latest" ? 
res.value()[0].version : normalized_version; - auto merged_variant_name = engine + "-" + latest_version_semantic + "-" + - variant_name.value() + ".tar.gz"; + std::unordered_set merged_variant_name = { + "llama-" + latest_version_semantic + "-bin-" + variant_name.value() + + ".tar.gz", // menlo + "llama-" + latest_version_semantic + "-bin-" + variant_name.value() + + ".zip"}; // ggml for (const auto& asset : res.value()) { - if (asset.name == merged_variant_name) { + if (merged_variant_name.find(asset.name) != merged_variant_name.end()) { selected_variant = asset; break; } @@ -275,43 +273,96 @@ cpp::result EngineService::DownloadEngine( } } - auto normalize_version = "v" + selected_variant->version; auto variant_folder_name = engine_matcher_utils::GetVariantFromNameAndVersion( selected_variant->name, engine, selected_variant->version); auto variant_folder_path = file_manager_utils::GetEnginesContainerPath() / engine / variant_folder_name.value() / - normalize_version; + selected_variant->version; auto variant_path = variant_folder_path / selected_variant->name; std::filesystem::create_directories(variant_folder_path); CTL_INF("variant_folder_path: " + variant_folder_path.string()); - auto on_finished = [this, engine, selected_variant, variant_folder_path, - normalize_version](const DownloadTask& finishedTask) { + auto on_finished = [this, engine, selected_variant, + variant_folder_path](const DownloadTask& finishedTask) { // try to unzip the downloaded file CTL_INF("Engine zip path: " << finishedTask.items[0].localPath.string()); - CTL_INF("Version: " + normalize_version); + CTL_INF("Version: " + selected_variant->version); auto extract_path = finishedTask.items[0].localPath.parent_path(); archive_utils::ExtractArchive(finishedTask.items[0].localPath.string(), extract_path.string(), true); - + CTL_INF("local path: " << finishedTask.items[0].localPath.string() + << ", extract path: " << extract_path.string()); auto variant = engine_matcher_utils::GetVariantFromNameAndVersion( - selected_variant->name, engine, normalize_version); - + selected_variant->name, engine, selected_variant->version); CTL_INF("Extracted variant: " + variant.value()); - // set as default + try { + // Create version file + std::ofstream meta(extract_path / "version.txt", std::ios::out); + meta << "name: " << variant.value() << std::endl; + meta << "version: " << selected_variant->version << std::endl; + meta.close(); + + std::filesystem::path bin_path = extract_path / "build" / "bin"; + if (std::filesystem::exists(bin_path)) { + for (const auto& entry : + std::filesystem::directory_iterator(bin_path)) { + if (entry.is_regular_file()) { + std::filesystem::path target_file = + extract_path / entry.path().filename(); + std::filesystem::copy_file( + entry.path(), target_file, + std::filesystem::copy_options::overwrite_existing); + } + } + std::filesystem::remove_all(bin_path.parent_path()); + } + if (!std::filesystem::exists(extract_path.parent_path().parent_path() / + "deps")) { + std::filesystem::create_directory( + extract_path.parent_path().parent_path() / "deps"); + } + std::filesystem::permissions(extract_path / kLlamaServer, + std::filesystem::perms::owner_exec | + std::filesystem::perms::group_exec | + std::filesystem::perms::others_exec, + std::filesystem::perm_options::add); + + const std::vector windows_deps = { + "msvcp140.dll", "vcruntime140.dll", "vcruntime140_1.dll"}; + for (auto const& win_dep : windows_deps) { + if (std::filesystem::exists( + file_manager_utils::GetExecutableFolderContainerPath() / + 
win_dep)) { + CTL_INF("Copy file " + << (file_manager_utils::GetExecutableFolderContainerPath() / + win_dep) + .string() + << " to " << extract_path.string()); + std::filesystem::copy_file( + file_manager_utils::GetExecutableFolderContainerPath() / win_dep, + extract_path / win_dep, + std::filesystem::copy_options::overwrite_existing); + } + } + + } catch (const std::exception& e) { + CTL_INF(e.what()); + } - auto res = - SetDefaultEngineVariant(engine, normalize_version, variant.value()); + // set as default + auto res = SetDefaultEngineVariant(engine, selected_variant->version, + variant.value()); if (res.has_error()) { CTL_ERR("Failed to set default engine variant: " << res.error()); } else { CTL_INF("Set default engine variant: " << res.value().variant); } - auto create_res = EngineService::UpsertEngine( - engine, // engine_name - kLocal, "", "", normalize_version, variant.value(), "Default", ""); + auto create_res = + EngineService::UpsertEngine(engine, // engine_name + kLocal, "", "", selected_variant->version, + variant.value(), "Default", ""); if (create_res.has_error()) { CTL_ERR("Failed to create engine entry: " << create_res->engine_name); @@ -322,7 +373,7 @@ cpp::result EngineService::DownloadEngine( for (const auto& entry : std::filesystem::directory_iterator( variant_folder_path.parent_path())) { if (entry.is_directory() && - entry.path().filename() != normalize_version) { + entry.path().filename() != selected_variant->version) { try { std::filesystem::remove_all(entry.path()); } catch (const std::exception& e) { @@ -450,7 +501,26 @@ std::string EngineService::GetMatchedVariant( cpp::result, std::string> EngineService::GetEngineReleases(const std::string& engine) const { auto ne = cortex::engine::NormalizeEngine(engine); - return github_release_utils::GetReleases("menloresearch", ne); + auto ggml_org = github_release_utils::GetReleases(kGgmlOrg, ne); + auto menlo = github_release_utils::GetReleases(kMenloOrg, ne); + if (ggml_org.has_error() && menlo.has_error()) { + return cpp::fail(ggml_org.error()); + } + auto comparator = [](const EngineService::EngineRelease& e1, + const EngineService::EngineRelease& e2) { + return e1.name > e2.name; + }; + std::set s(comparator); + if (ggml_org.has_value()) { + s.insert(ggml_org.value().begin(), ggml_org.value().end()); + } + + if (menlo.has_value()) { + s.insert(menlo.value().begin(), menlo.value().end()); + } + std::vector res; + std::copy(s.begin(), s.end(), std::back_inserter(res)); + return res; } cpp::result, std::string> @@ -458,16 +528,85 @@ EngineService::GetEngineVariants(const std::string& engine, const std::string& version, bool filter_compatible_only) const { auto ne = cortex::engine::NormalizeEngine(engine); - auto engine_release = - github_release_utils::GetReleaseByVersion("menloresearch", ne, version); + auto engine_release_menlo = + github_release_utils::GetReleaseByVersion(kMenloOrg, ne, version); + auto engine_release_ggml = + github_release_utils::GetReleaseByVersion(kGgmlOrg, ne, version); + + if (engine_release_menlo.has_error() && engine_release_ggml.has_error()) { + return cpp::fail("Failed to get engine release: " + + engine_release_menlo.error()); + } + if (engine_release_menlo.has_error()) { + CTL_WRN("Failed to get engine release: " << engine_release_menlo.error()); + } - if (engine_release.has_error()) { - return cpp::fail("Failed to get engine release: " + engine_release.error()); + if (engine_release_ggml.has_error()) { + CTL_WRN("Failed to get engine release: " << engine_release_ggml.error()); } 
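  // Variant discovery now merges release assets from two sources (kMenloOrg
  // and kGgmlOrg).  On macOS the OS major version decides which source's mac
  // binaries survive the copy_if filters below: macOS 12 and older keep the
  // Menlo builds, newer versions keep the upstream ggml-org builds.  On other
  // platforms assets from both sources pass through and are narrowed by the
  // OS/arch name matching further down.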
std::vector compatible_variants; - for (const auto& variant : engine_release.value().assets) { - if (variant.content_type != "application/gzip") { + std::vector assets; + + auto get_os_major = []() -> int { + auto os_info = cortex::hw::GetOSInfo(); + // Get os major version + size_t dot_pos = os_info.version.find_first_of("."); + if (dot_pos != std::string::npos) { + try { + return std::stoi(os_info.version.substr(0, dot_pos)); + } catch (const std::exception& e) { + return 0; + } + } else { + // No version found + return 0; + } + }; + + if (engine_release_menlo.has_value()) { + // In case of macos, if os version is 12, we get binary from menlo + std::copy_if( + engine_release_menlo.value().assets.begin(), + engine_release_menlo.value().assets.end(), std::back_inserter(assets), + [get_os_major](const github_release_utils::GitHubAsset& assets) { +#if defined(__APPLE__) && defined(__MACH__) + if ((assets.name.find(kMacOs) == std::string::npos) || + (get_os_major() <= 12 && + assets.name.find(kMacOs) != std::string::npos)) { + return true; + } + return false; +#else + return true; +#endif + }); + } + + if (engine_release_ggml.has_value()) { + // In case of macos, if os version is 12, we get binary from menlo + std::copy_if( + engine_release_ggml.value().assets.begin(), + engine_release_ggml.value().assets.end(), std::back_inserter(assets), + [get_os_major](const github_release_utils::GitHubAsset& assets) { +#if defined(__APPLE__) && defined(__MACH__) + if ((assets.name.find(kMacOs) == std::string::npos) || + (get_os_major() > 12 && + assets.name.find(kMacOs) != std::string::npos)) { + return true; + } + return false; +#else + return true; +#endif + }); + } + + for (const auto& variant : assets) { + CTL_INF("content_type: " << variant.content_type + << ", name: " << variant.name); + if (variant.content_type != "application/gzip" && + variant.content_type != "application/json; charset=utf-8") { continue; } if (variant.state != "uploaded") { @@ -494,30 +633,29 @@ EngineService::GetEngineVariants(const std::string& engine, name.find("mac") != std::string::npos) os_match = true; if (system_info->os == "windows" && - name.find("windows") != std::string::npos) + name.find("win") != std::string::npos) os_match = true; if (system_info->os == "linux" && - name.find("linux") != std::string::npos) + (name.find("linux") != std::string::npos || + name.find("ubuntu") != std::string::npos)) os_match = true; bool arch_match = false; if (system_info->arch == "arm64" && name.find("arm64") != std::string::npos) arch_match = true; - if (system_info->arch == "amd64" && - name.find("amd64") != std::string::npos) + if (system_info->arch == "x64" && + name.find("x64") != std::string::npos) arch_match = true; return !(os_match && arch_match); }), compatible_variants.end()); - if (compatible_variants.empty()) { return cpp::fail("No compatible variants found for system " + system_info->os + "/" + system_info->arch); } } - return compatible_variants; } @@ -550,7 +688,7 @@ EngineService::SetDefaultEngineVariant(const std::string& engine, auto normalized_version = string_utils::RemoveSubstring(version, "v"); auto config = file_manager_utils::GetCortexConfig(); - config.llamacppVersion = "v" + normalized_version; + config.llamacppVersion = normalized_version; config.llamacppVariant = variant; auto result = file_manager_utils::UpdateCortexConfig(config); if (result.has_error()) { @@ -574,10 +712,10 @@ cpp::result EngineService::IsEngineVariantReady( return cpp::fail(installed_engines.error()); } - 
CLI_LOG("IsEngineVariantReady: " << ne << ", " << normalized_version << ", " + CTL_INF("IsEngineVariantReady: " << ne << ", " << normalized_version << ", " << variant); for (const auto& installed_engine : installed_engines.value()) { - CLI_LOG("Installed: name: " + installed_engine.name + + CTL_INF("Installed: name: " + installed_engine.name + ", version: " + installed_engine.version); if ((installed_engine.name == variant && installed_engine.version == normalized_version) || @@ -634,16 +772,22 @@ EngineService::GetInstalledEngineVariants(const std::string& engine) const { // try to find version.txt auto version_txt_path = version_entry.path() / "version.txt"; if (!std::filesystem::exists(version_txt_path)) { - continue; + // create new one + std::ofstream meta(version_txt_path, std::ios::out); + meta << "name: " << entry.path().filename() << std::endl; + meta << "version: " << version_entry.path().filename() << std::endl; + meta.close(); + CTL_INF("name: " << entry.path().filename().string() << ", version: " + << version_entry.path().filename().string()); } try { auto node = YAML::LoadFile(version_txt_path.string()); auto ev = EngineVariantResponse{ - node["name"].as(), // name - "v" + node["version"].as(), // version - engine, // engine - "", // type + node["name"].as(), // name + node["version"].as(), // version + engine, // engine + "", // type }; variants.push_back(ev); } catch (const YAML::Exception& e) { @@ -696,76 +840,18 @@ cpp::result EngineService::LoadEngine( } return {}; } - - // End hard code - - CTL_INF("Loading engine: " << ne); + if (engines_.find(ne) == engines_.end()) { + CTL_INF("Loading local engine: " << engine_name); #if defined(_WIN32) || defined(_WIN64) || defined(__linux__) - CTL_INF("CPU Info: " << cortex::cpuid::CpuInfo().to_string()); + CTL_INF("CPU Info: " << cortex::cpuid::CpuInfo().to_string()); #endif - - auto engine_dir_path_res = GetEngineDirPath(ne); - if (engine_dir_path_res.has_error()) { - return cpp::fail(engine_dir_path_res.error()); + engines_[ne].engine = new cortex::local::LocalEngine(*this, *(q_.get())); + CTL_INF("Loaded engine: " << engine_name); + } else { + CTL_INF("Engine has already been loaded: " << engine_name); } - auto engine_dir_path = engine_dir_path_res.value().first; - auto custom_engine_path = engine_dir_path_res.value().second; - - try { - auto cuda_path = file_manager_utils::GetCudaToolkitPath(ne); - -#if defined(_WIN32) || defined(_WIN64) - // register deps - if (!(getenv("ENGINE_PATH"))) { - std::vector paths{}; - paths.push_back(cuda_path); - paths.push_back(engine_dir_path); - CTL_DBG("Registering dylib for " - << ne << " with " << std::to_string(paths.size()) << " paths."); - for (const auto& path : paths) { - CTL_DBG("Registering path: " << path.string()); - } - - auto reg_result = dylib_path_manager_->RegisterPath(ne, paths); - if (reg_result.has_error()) { - CTL_DBG("Failed register lib paths for: " << ne); - } else { - CTL_DBG("Registered lib paths for: " << ne); - } - } -#endif - - auto dylib = - std::make_unique(engine_dir_path.string(), "engine"); - - auto config = file_manager_utils::GetCortexConfig(); - auto log_path = std::filesystem::path(config.logFolderPath) / - std::filesystem::path(config.logLlamaCppPath); - - // init - auto func = dylib->get_function("get_engine"); - auto engine_obj = func(); - auto load_opts = EngineI::EngineLoadOption{ - /* .engine_path = */ engine_dir_path, - /* .deps_path = */ cuda_path, - /* .is_custom_engine_path = */ custom_engine_path, - /* .log_path = */ log_path, - /* 
.max_log_lines = */ config.maxLogLines, - /* .log_level = */ logging_utils_helper::global_log_level, - }; - engine_obj->Load(load_opts); - - engines_[ne].engine = engine_obj; - engines_[ne].dl = std::move(dylib); - - CTL_DBG("Engine loaded: " << ne); - return {}; - } catch (const cortex_cpp::dylib::load_error& e) { - CTL_ERR("Could not load engine: " << e.what()); - engines_.erase(ne); - return cpp::fail("Could not load engine " + ne + ": " + e.what()); - } + return {}; } void EngineService::RegisterEngineLibPath() { @@ -785,7 +871,9 @@ void EngineService::RegisterEngineLibPath() { // register deps std::vector paths{}; - paths.push_back(cuda_path); + if (std::filesystem::exists(cuda_path)) { + paths.push_back(cuda_path); + } paths.push_back(engine_dir_path); CTL_DBG("Registering dylib for " @@ -796,7 +884,8 @@ void EngineService::RegisterEngineLibPath() { auto reg_result = dylib_path_manager_->RegisterPath(ne, paths); if (reg_result.has_error()) { - CTL_WRN("Failed register lib path for " << engine); + CTL_WRN("Failed register lib path for " + << engine << ", error: " << reg_result.error()); } else { CTL_DBG("Registered lib path for " << engine); } @@ -829,8 +918,8 @@ EngineService::GetEngineDirPath(const std::string& engine_name) { CTL_DBG("user defined engine path: " << user_defined_engine_path); const std::filesystem::path engine_dir_path = [&] { if (user_defined_engine_path != nullptr) { - return std::filesystem::path(user_defined_engine_path) / - GetEnginePath(ne) / selected_engine_variant->variant / + return std::filesystem::path(user_defined_engine_path) / kLlamaLibPath / + selected_engine_variant->variant / selected_engine_variant->version; } else { return file_manager_utils::GetEnginesContainerPath() / ne / @@ -891,8 +980,7 @@ std::vector EngineService::GetLoadedEngines() { cpp::result EngineService::GetLatestEngineVersion(const std::string& engine) const { auto ne = cortex::engine::NormalizeEngine(engine); - auto res = - github_release_utils::GetReleaseByVersion("menloresearch", ne, "latest"); + auto res = github_release_utils::GetReleaseByVersion(kMenloOrg, ne, "latest"); if (res.has_error()) { return cpp::fail("Failed to fetch engine " + engine + " latest version!"); } diff --git a/engine/services/engine_service.h b/engine/services/engine_service.h index 7e6be74c5..0be1fff64 100644 --- a/engine/services/engine_service.h +++ b/engine/services/engine_service.h @@ -19,6 +19,7 @@ #include "utils/github_release_utils.h" #include "utils/result.hpp" #include "utils/system_info_utils.h" +#include "utils/task_queue.h" struct EngineUpdateResult { std::string engine; @@ -44,7 +45,6 @@ class EngineService : public EngineServiceI { using EngineVariant = github_release_utils::GitHubAsset; struct EngineInfo { - std::unique_ptr dl; EngineV engine; }; @@ -60,12 +60,13 @@ class EngineService : public EngineServiceI { }; HardwareInfo hw_inf_; std::shared_ptr db_service_ = nullptr; + std::shared_ptr q_ = nullptr; public: - explicit EngineService( - std::shared_ptr download_service, - std::shared_ptr dylib_path_manager, - std::shared_ptr db_service) + EngineService(std::shared_ptr download_service, + std::shared_ptr dylib_path_manager, + std::shared_ptr db_service, + std::shared_ptr q) : download_service_{download_service}, dylib_path_manager_{dylib_path_manager}, hw_inf_{ @@ -74,9 +75,17 @@ class EngineService : public EngineServiceI { system_info_utils::GetDriverAndCudaVersion() .second // cuda_driver_version. 
}, + db_service_(db_service), + q_(q) {} - db_service_(db_service) {} - + EngineService(std::shared_ptr dylib_path_manager) + : dylib_path_manager_(dylib_path_manager), + hw_inf_{ + system_info_utils::GetSystemInfo(), // sys_inf. + {}, // cpu_info. + system_info_utils::GetDriverAndCudaVersion() + .second // cuda_driver_version. + } {} std::vector GetEngineInfoList() const; /** @@ -159,6 +168,9 @@ class EngineService : public EngineServiceI { bool IsRemoteEngine(const std::string& engine_name) const override; + cpp::result, std::string> + GetEngineDirPath(const std::string& engine_name); + private: bool IsEngineLoaded(const std::string& engine); @@ -172,9 +184,6 @@ class EngineService : public EngineServiceI { std::string GetMatchedVariant(const std::string& engine, const std::vector& variants); - cpp::result, std::string> - GetEngineDirPath(const std::string& engine_name); - cpp::result IsEngineVariantReady( const std::string& engine, const std::string& version, const std::string& variant); diff --git a/engine/services/hardware_service.cc b/engine/services/hardware_service.cc index f0ccadb28..fb2f841be 100644 --- a/engine/services/hardware_service.cc +++ b/engine/services/hardware_service.cc @@ -203,10 +203,8 @@ bool HardwareService::Restart(const std::string& host, int port) { #else std::vector commands; // Some engines requires to add lib search path before process being created - auto download_srv = std::make_shared(); - auto dylib_path_mng = std::make_shared(); - auto db_srv = std::make_shared(); - EngineService(download_srv, dylib_path_mng, db_srv).RegisterEngineLibPath(); + EngineService(std::make_shared()) + .RegisterEngineLibPath(); std::string p = cortex_utils::GetCurrentPath() / exe; commands.push_back(p); commands.push_back("--ignore_cout"); diff --git a/engine/services/inference_service.cc b/engine/services/inference_service.cc index a1646495b..e07ed71ba 100644 --- a/engine/services/inference_service.cc +++ b/engine/services/inference_service.cc @@ -12,7 +12,7 @@ cpp::result InferenceService::HandleChatCompletion( } else { engine_type = (*(json_body)).get("engine", kLlamaRepo).asString(); } - function_calling_utils::PreprocessRequest(json_body); + CTL_DBG("engine_type: " << engine_type); auto tool_choice = json_body->get("tool_choice", Json::Value::null); auto model_id = json_body->get("model", "").asString(); if (saved_models_.find(model_id) != saved_models_.end()) { @@ -32,6 +32,7 @@ cpp::result InferenceService::HandleChatCompletion( } } } + CTL_DBG("engine_type: " << engine_type); auto engine_result = engine_service_->GetLoadedEngine(engine_type); if (engine_result.has_error()) { @@ -43,51 +44,6 @@ cpp::result InferenceService::HandleChatCompletion( return cpp::fail(std::make_pair(stt, res)); } - if (!model_id.empty()) { - if (auto model_service = model_service_.lock()) { - auto metadata_ptr = model_service->GetCachedModelMetadata(model_id); - if (metadata_ptr != nullptr && - !metadata_ptr->tokenizer->chat_template.empty()) { - auto tokenizer = metadata_ptr->tokenizer; - auto messages = (*json_body)["messages"]; - Json::Value messages_jsoncpp(Json::arrayValue); - for (auto message : messages) { - messages_jsoncpp.append(message); - } - - Json::Value tools(Json::arrayValue); - Json::Value template_data_json; - template_data_json["messages"] = messages_jsoncpp; - // template_data_json["tools"] = tools; - - auto prompt_result = jinja::RenderTemplate( - tokenizer->chat_template, template_data_json, tokenizer->bos_token, - tokenizer->eos_token, tokenizer->add_bos_token, - 
tokenizer->add_eos_token, tokenizer->add_generation_prompt); - if (prompt_result.has_value()) { - (*json_body)["prompt"] = prompt_result.value(); - if (json_body->isMember("stop")) { - bool need_append = true; - for (auto& s : (*json_body)["stop"]) { - if (s.asString() == tokenizer->eos_token) { - need_append = false; - } - } - if (need_append) { - (*json_body)["stop"].append(tokenizer->eos_token); - } - } else { - Json::Value stops(Json::arrayValue); - stops.append(tokenizer->eos_token); - (*json_body)["stop"] = stops; - } - } else { - CTL_ERR("Failed to render prompt: " + prompt_result.error()); - } - } - } - } - CTL_DBG("Json body inference: " + json_body->toStyledString()); auto cb = [q, tool_choice](Json::Value status, Json::Value res) { @@ -275,9 +231,7 @@ InferResult InferenceService::GetModels( for (const auto& loaded_engine : loaded_engines) { if (std::holds_alternative(loaded_engine)) { auto e = std::get(loaded_engine); - if (e->IsSupported("GetModels")) { - e->GetModels(json_body, std::move(cb)); - } + e->GetModels(json_body, std::move(cb)); } else { std::get(loaded_engine) ->GetModels(json_body, std::move(cb)); @@ -302,10 +256,8 @@ bool InferenceService::StopInferencing(const std::string& engine_name, if (std::holds_alternative(engine_result.value())) { auto engine = std::get(engine_result.value()); - if (engine->IsSupported("StopInferencing")) { - engine->StopInferencing(model_id); - CTL_INF("Stopped inferencing"); - } + engine->StopInferencing(model_id); + CTL_INF("Stopped inferencing"); } return true; } diff --git a/engine/services/model_service.cc b/engine/services/model_service.cc index d9359b698..a3771e0a1 100644 --- a/engine/services/model_service.cc +++ b/engine/services/model_service.cc @@ -165,8 +165,8 @@ ModelService::ModelService(std::shared_ptr db_service, download_service_{download_service}, inference_svc_(inference_service), engine_svc_(engine_svc), - task_queue_(task_queue) { - // ProcessBgrTasks(); + task_queue_(task_queue){ + // ProcessBgrTasks(); }; void ModelService::ForceIndexingModelList() { @@ -500,13 +500,10 @@ cpp::result ModelService::DeleteModel( std::filesystem::remove(yaml_fp); CTL_INF("Removed: " << yaml_fp.string()); } else { - // Remove yaml files - for (const auto& entry : - std::filesystem::directory_iterator(yaml_fp.parent_path())) { - if (entry.is_regular_file() && (entry.path().extension() == ".yml")) { - std::filesystem::remove(entry); - CTL_INF("Removed: " << entry.path().string()); - } + // Is a local model - Remove only this model's yaml file + if (std::filesystem::exists(yaml_fp)) { + std::filesystem::remove(yaml_fp); + CTL_INF("Removed: " << yaml_fp.string()); } } @@ -557,6 +554,8 @@ cpp::result ModelService::StartModel( if (auto& o = params_override["ctx_len"]; !o.isNull()) { ctx_len = o.asInt(); } + Json::Value model_load_params; + json_helper::MergeJson(model_load_params, params_override); try { constexpr const int kDefautlContextLength = 8192; @@ -627,9 +626,14 @@ cpp::result ModelService::StartModel( #if defined(_WIN32) json_data["model_path"] = cortex::wc::WstringToUtf8( fmu::ToAbsoluteCortexDataPath(fs::path(mc.files[0])).wstring()); + model_load_params["model_path"] = + cortex::wc::WstringToUtf8( + fmu::ToAbsoluteCortexDataPath(fs::path(mc.files[0])).wstring()); #else json_data["model_path"] = fmu::ToAbsoluteCortexDataPath(fs::path(mc.files[0])).string(); + model_load_params["model_path"] = + fmu::ToAbsoluteCortexDataPath(fs::path(mc.files[0])).string(); #endif } else { LOG_WARN << "model_path is empty"; @@ -642,6 +646,8 @@ 
cpp::result ModelService::StartModel( #else json_data["mmproj"] = fmu::ToAbsoluteCortexDataPath(fs::path(mc.mmproj)).string(); + model_load_params["mmproj"] = + fmu::ToAbsoluteCortexDataPath(fs::path(mc.mmproj)).string(); #endif } json_data["system_prompt"] = mc.system_template; @@ -655,6 +661,7 @@ cpp::result ModelService::StartModel( } json_data["model"] = model_handle; + model_load_params["model"] = model_handle; if (auto& cpt = custom_prompt_template; !cpt.value_or("").empty()) { auto parse_prompt_result = string_utils::ParsePrompt(cpt.value()); json_data["system_prompt"] = parse_prompt_result.system_prompt; @@ -662,8 +669,6 @@ cpp::result ModelService::StartModel( json_data["ai_prompt"] = parse_prompt_result.ai_prompt; } - json_helper::MergeJson(json_data, params_override); - // Set default cpu_threads if it is not configured if (!json_data.isMember("cpu_threads")) { json_data["cpu_threads"] = GetCpuThreads(); @@ -686,26 +691,12 @@ cpp::result ModelService::StartModel( assert(!!inference_svc_); - auto ir = - inference_svc_->LoadModel(std::make_shared<Json::Value>(json_data)); + auto ir = inference_svc_->LoadModel( + std::make_shared<Json::Value>(model_load_params)); auto status = std::get<0>(ir)["status_code"].asInt(); auto data = std::get<1>(ir); if (status == drogon::k200OK) { - // start model successfully, in case not vision model, we store the metadata so we can use - // for each inference - if (!json_data.isMember("mmproj") || json_data["mmproj"].isNull()) { - auto metadata_res = GetModelMetadata(model_handle); - if (metadata_res.has_value()) { - loaded_model_metadata_map_.emplace(model_handle, - std::move(metadata_res.value())); - CTL_INF("Successfully stored metadata for model " << model_handle); - } else { - CTL_WRN("Failed to get metadata for model " << model_handle << ": " - << metadata_res.error()); - } - } - return StartModelResult{/* .success = */ true, /* .warning = */ may_fallback_res.value()}; } else if (status == drogon::k409Conflict) { @@ -760,8 +751,6 @@ cpp::result ModelService::StopModel( if (bypass_check) { bypass_stop_check_set_.erase(model_handle); } - loaded_model_metadata_map_.erase(model_handle); - CTL_INF("Removed metadata for model " << model_handle); return true; } else { CTL_ERR("Model failed to stop with status code: " << status); @@ -1047,13 +1036,15 @@ ModelService::MayFallbackToCpu(const std::string& model_path, int ngl, auto es = hardware::EstimateLLaMACppRun(model_path, rc); if (!!es && (*es).gpu_mode.vram_MiB > free_vram_MiB && is_cuda) { - CTL_WRN("Not enough VRAM - " << "required: " << (*es).gpu_mode.vram_MiB << ", available: " << free_vram_MiB); + CTL_WRN("Not enough VRAM - " + << "required: " << (*es).gpu_mode.vram_MiB + << ", available: " << free_vram_MiB); } if (!!es && (*es).cpu_mode.ram_MiB > free_ram_MiB) { - CTL_WRN("Not enough RAM - " << "required: " << (*es).cpu_mode.ram_MiB << ", available: " << free_ram_MiB); + CTL_WRN("Not enough RAM - " + << "required: " << (*es).cpu_mode.ram_MiB + << ", available: " << free_ram_MiB); } return warning; @@ -1090,14 +1081,6 @@ ModelService::GetModelMetadata(const std::string& model_id) const { return std::move(*model_metadata_res); } -std::shared_ptr ModelService::GetCachedModelMetadata( - const std::string& model_id) const { - if (loaded_model_metadata_map_.find(model_id) == - loaded_model_metadata_map_.end()) - return nullptr; - return loaded_model_metadata_map_.at(model_id); -} - std::string ModelService::GetEngineByModelId( const std::string& model_id) const { namespace fs = std::filesystem; diff --git 
a/engine/services/model_service.h b/engine/services/model_service.h index beba91f8c..fa247b954 100644 --- a/engine/services/model_service.h +++ b/engine/services/model_service.h @@ -83,9 +83,6 @@ class ModelService { cpp::result, std::string> GetModelMetadata( const std::string& model_id) const; - std::shared_ptr GetCachedModelMetadata( - const std::string& model_id) const; - std::string GetEngineByModelId(const std::string& model_id) const; private: @@ -104,12 +101,6 @@ class ModelService { std::unordered_set bypass_stop_check_set_; std::shared_ptr engine_svc_ = nullptr; - /** - * Store the chat template of loaded model. - */ - std::unordered_map> - loaded_model_metadata_map_; - std::mutex es_mtx_; std::unordered_map> es_; cortex::TaskQueue& task_queue_; diff --git a/engine/services/model_source_service.cc b/engine/services/model_source_service.cc index b5979667c..661b9b580 100644 --- a/engine/services/model_source_service.cc +++ b/engine/services/model_source_service.cc @@ -433,8 +433,7 @@ cpp::result ModelSourceService::AddCortexsoRepo( auto author = hub_author; auto model_author = hu::GetModelAuthorCortexsoHub(model_name); - if (auto model_author = hu::GetModelAuthorCortexsoHub(model_name); - model_author.has_value() && !model_author.value().empty()) { + if (model_author.has_value() && !model_author.value().empty()) { author = model_author.value(); } diff --git a/engine/test/components/test_engine_matcher_utils.cc b/engine/test/components/test_engine_matcher_utils.cc index 1d1ed47a8..2c24a9b6f 100644 --- a/engine/test/components/test_engine_matcher_utils.cc +++ b/engine/test/components/test_engine_matcher_utils.cc @@ -6,125 +6,78 @@ class EngineMatcherUtilsTestSuite : public ::testing::Test { protected: const std::vector cortex_llamacpp_variants{ - "cortex.llamacpp-0.1.25-25.08.24-linux-amd64-avx-cuda-11-7.tar.gz", - "cortex.llamacpp-0.1.25-25.08.24-linux-amd64-avx-cuda-12-0.tar.gz", - "cortex.llamacpp-0.1.25-25.08.24-linux-amd64-avx.tar.gz", - "cortex.llamacpp-0.1.25-25.08.24-linux-amd64-avx2-cuda-11-7.tar.gz", - "cortex.llamacpp-0.1.25-25.08.24-linux-amd64-avx2-cuda-12-0.tar.gz", - "cortex.llamacpp-0.1.25-25.08.24-linux-amd64-avx2.tar.gz", - "cortex.llamacpp-0.1.25-25.08.24-linux-amd64-avx512-cuda-11-7.tar.gz", - "cortex.llamacpp-0.1.25-25.08.24-linux-amd64-avx512-cuda-12-0.tar.gz", - "cortex.llamacpp-0.1.25-25.08.24-linux-amd64-avx512.tar.gz", - "cortex.llamacpp-0.1.25-25.08.24-linux-amd64-noavx-cuda-11-7.tar.gz", - "cortex.llamacpp-0.1.25-25.08.24-linux-amd64-noavx-cuda-12-0.tar.gz", - "cortex.llamacpp-0.1.25-25.08.24-linux-amd64-noavx.tar.gz", - "cortex.llamacpp-0.1.25-25.08.24-linux-amd64-vulkan.tar.gz", - "cortex.llamacpp-0.1.43-linux-arm64.tar.gz", - "cortex.llamacpp-0.1.25-25.08.24-mac-amd64.tar.gz", - "cortex.llamacpp-0.1.25-25.08.24-mac-arm64.tar.gz", - "cortex.llamacpp-0.1.25-25.08.24-windows-amd64-avx-cuda-11-7.tar.gz", - "cortex.llamacpp-0.1.25-25.08.24-windows-amd64-avx-cuda-12-0.tar.gz", - "cortex.llamacpp-0.1.25-25.08.24-windows-amd64-avx.tar.gz", - "cortex.llamacpp-0.1.25-25.08.24-windows-amd64-avx2-cuda-11-7.tar.gz", - "cortex.llamacpp-0.1.25-25.08.24-windows-amd64-avx2-cuda-12-0.tar.gz", - "cortex.llamacpp-0.1.25-25.08.24-windows-amd64-avx2.tar.gz", - "cortex.llamacpp-0.1.25-25.08.24-windows-amd64-avx512-cuda-11-7.tar.gz", - "cortex.llamacpp-0.1.25-25.08.24-windows-amd64-avx512-cuda-12-0.tar.gz", - "cortex.llamacpp-0.1.25-25.08.24-windows-amd64-avx512.tar.gz", - "cortex.llamacpp-0.1.25-25.08.24-windows-amd64-noavx-cuda-11-7.tar.gz", - 
"cortex.llamacpp-0.1.25-25.08.24-windows-amd64-noavx-cuda-12-0.tar.gz", - "cortex.llamacpp-0.1.25-25.08.24-windows-amd64-noavx.tar.gz", - "cortex.llamacpp-0.1.25-25.08.24-windows-amd64-vulkan.tar.gz", + "llama-b4920-bin-ubuntu-arm64.zip", + "llama-b4920-bin-linux-avx-cuda-cu11.7-x64.tar.gz", + "llama-b4920-bin-linux-avx-cuda-cu12.0-x64.tar.gz", + "llama-b4920-bin-linux-avx-x64.tar.gz", + "llama-b4920-bin-linux-avx2-cuda-cu11.7-x64.tar.gz", + "llama-b4920-bin-linux-avx2-cuda-cu12.0-x64.tar.gz", + "llama-b4920-bin-ubuntu-x64.tar.gz", + "llama-b4920-bin-linux-avx512-cuda-cu11.7-x64.tar.gz", + "llama-b4920-bin-linux-avx512-cuda-cu12.0-x64.tar.gz", + "llama-b4920-bin-linux-avx512-x64.tar.gz", + "llama-b4920-bin-linux-noavx-cuda-cu11.7-x64.tar.gz", + "llama-b4920-bin-linux-noavx-cuda-cu12.0-x64.tar.gz", + "llama-b4920-bin-linux-noavx-x64.tar.gz", + "llama-b4920-bin-ubuntu-vulkan-x64.tar.gz", + "llama-b4920-bin-macos-arm64.zip", + "llama-b4920-bin-macos-x64.zip", + "llama-b4920-bin-win-avx-cuda-cu11.7-x64.tar.gz", + "llama-b4920-bin-win-avx-cuda-cu12.0-x64.tar.gz", + "llama-b4920-bin-win-avx-x64.zip", + "llama-b4920-bin-win-avx2-cuda-cu11.7-x64.tar.gz", + "llama-b4920-bin-win-avx2-cuda-cu12.0-x64.tar.gz", + "llama-b4920-bin-win-avx2-x64.zip", + "llama-b4920-bin-win-avx512-cuda-cu11.7-x64.tar.gz", + "llama-b4920-bin-win-avx512-cuda-cu12.0-x64.tar.gz", + "llama-b4920-bin-win-avx512-x64.zip", + "llama-b4920-bin-win-noavx-cuda-cu11.7-x64.tar.gz", + "llama-b4920-bin-win-noavx-cuda-cu12.0-x64.tar.gz", + "llama-b4920-bin-win-noavx-x64.zip", + "llama-b4920-bin-win-vulkan-x64.zip", }; - - const std::vector cortex_tensorrt_variants{ - "cortex.tensorrt-llm-0.0.9-linux-cuda-12-4.tar.gz", - "cortex.tensorrt-llm-0.0.9-windows-cuda-12-4.tar.gz"}; - - const std::vector cortex_onnx_variants{ - "cortex.onnx-0.1.7-windows-amd64.tar.gz"}; }; -TEST_F(EngineMatcherUtilsTestSuite, TestValidateOnnx) { - - { - auto expect_matched_variant = cortex_onnx_variants[0]; - auto result = engine_matcher_utils::ValidateOnnx(cortex_onnx_variants, - "windows", "amd64"); - - EXPECT_EQ(result, expect_matched_variant); - } - - { - // should return an empty variant because no variant matched - auto expect_matched_variant{""}; - auto windows_arm_result = engine_matcher_utils::ValidateOnnx( - cortex_onnx_variants, "windows", "arm"); - auto mac_arm64_result = engine_matcher_utils::ValidateOnnx( - cortex_onnx_variants, "mac", "arm64"); - - EXPECT_EQ(windows_arm_result, expect_matched_variant); - EXPECT_EQ(mac_arm64_result, expect_matched_variant); - } -} - -TEST_F(EngineMatcherUtilsTestSuite, TestValidateTensorrt) { - +TEST_F(EngineMatcherUtilsTestSuite, TestValidate) { { - auto windows_expect_matched_variant{cortex_tensorrt_variants[1]}; - auto linux_expect_matched_variant{cortex_tensorrt_variants[0]}; - auto windows{"windows"}; - auto linux{"linux"}; + auto os{"win"}; + auto cpu_arch{"x64"}; + auto suitable_avx{"avx2"}; auto cuda_version{"12.4"}; - auto windows_result = engine_matcher_utils::ValidateTensorrtLlm( - cortex_tensorrt_variants, windows, cuda_version); - auto linux_result = engine_matcher_utils::ValidateTensorrtLlm( - cortex_tensorrt_variants, linux, cuda_version); - EXPECT_EQ(windows_result, windows_expect_matched_variant); - EXPECT_EQ(linux_result, linux_expect_matched_variant); - } - - { // macos is not supported - auto os = "mac"; - auto cuda_version{"12.4"}; + auto variant = engine_matcher_utils::Validate( + cortex_llamacpp_variants, os, cpu_arch, suitable_avx, cuda_version); - auto result = 
engine_matcher_utils::ValidateTensorrtLlm( - cortex_tensorrt_variants, os, cuda_version); - EXPECT_EQ(result, ""); + EXPECT_EQ(variant, "llama-b4920-bin-win-avx2-cuda-cu12.0-x64.tar.gz"); } -} -TEST_F(EngineMatcherUtilsTestSuite, TestValidate) { { - auto os{"windows"}; - auto cpu_arch{"amd64"}; - auto suitable_avx{"avx2"}; - auto cuda_version{"12.4"}; + auto os{"mac"}; + auto cpu_arch{"x64"}; + auto suitable_avx{""}; + auto cuda_version{""}; auto variant = engine_matcher_utils::Validate( cortex_llamacpp_variants, os, cpu_arch, suitable_avx, cuda_version); - EXPECT_EQ( - variant, - "cortex.llamacpp-0.1.25-25.08.24-windows-amd64-avx2-cuda-12-0.tar.gz"); + EXPECT_EQ(variant, "llama-b4920-bin-macos-x64.zip"); } { auto os{"mac"}; - auto cpu_arch{"amd64"}; + auto cpu_arch{"arm64"}; auto suitable_avx{""}; auto cuda_version{""}; auto variant = engine_matcher_utils::Validate( cortex_llamacpp_variants, os, cpu_arch, suitable_avx, cuda_version); - EXPECT_EQ(variant, "cortex.llamacpp-0.1.25-25.08.24-mac-amd64.tar.gz"); + EXPECT_EQ(variant, "llama-b4920-bin-macos-arm64.zip"); } { - auto os{"windows"}; - auto cpu_arch{"amd64"}; + auto os{"win"}; + auto cpu_arch{"x64"}; auto suitable_avx{"avx2"}; auto cuda_version{"10"}; @@ -132,8 +85,7 @@ TEST_F(EngineMatcherUtilsTestSuite, TestValidate) { cortex_llamacpp_variants, os, cpu_arch, suitable_avx, cuda_version); // fallback to no cuda version - EXPECT_EQ(variant, - "cortex.llamacpp-0.1.25-25.08.24-windows-amd64-avx2.tar.gz"); + EXPECT_EQ(variant, "llama-b4920-bin-win-avx2-x64.zip"); } { @@ -145,30 +97,43 @@ TEST_F(EngineMatcherUtilsTestSuite, TestValidate) { auto variant = engine_matcher_utils::Validate( cortex_llamacpp_variants, os, cpu_arch, suitable_avx, cuda_version); - EXPECT_EQ(variant, "cortex.llamacpp-0.1.43-linux-arm64.tar.gz"); + EXPECT_EQ(variant, "llama-b4920-bin-ubuntu-arm64.zip"); } } TEST_F(EngineMatcherUtilsTestSuite, TestGetVersionAndArch) { { - std::string variant = - "cortex.llamacpp-0.1.25-25.08.24-linux-amd64-avx-cuda-11-7.tar.gz"; + std::string variant = "llama-b4920-bin-linux-avx-cuda-cu11.7-x64.tar.gz"; + auto [version, arch] = engine_matcher_utils::GetVersionAndArch(variant); + EXPECT_EQ(version, "b4920"); + EXPECT_EQ(arch, "linux-avx-cuda-cu11.7-x64"); + } + + { + std::string variant = "llama-b4920-bin-ubuntu-arm64.zip"; + auto [version, arch] = engine_matcher_utils::GetVersionAndArch(variant); + EXPECT_EQ(version, "b4920"); + EXPECT_EQ(arch, "ubuntu-arm64"); + } + + { + std::string variant = "llama-b4920-bin-win-avx2-x64.zip"; auto [version, arch] = engine_matcher_utils::GetVersionAndArch(variant); - EXPECT_EQ(version, "v0.1.25-25.08.24"); - EXPECT_EQ(arch, "linux-amd64-avx-cuda-11-7"); + EXPECT_EQ(version, "b4920"); + EXPECT_EQ(arch, "win-avx2-x64"); } { - std::string variant = "cortex.llamacpp-0.1.25-windows-amd64-avx2.tar.gz"; + std::string variant = "llama-b4920-bin-macos-x64.tar.gz"; auto [version, arch] = engine_matcher_utils::GetVersionAndArch(variant); - EXPECT_EQ(version, "v0.1.25"); - EXPECT_EQ(arch, "windows-amd64-avx2"); + EXPECT_EQ(version, "b4920"); + EXPECT_EQ(arch, "macos-x64"); } { - std::string variant = "cortex.llamacpp-0.1.25-25.08.24-mac-amd64.tar.gz"; + std::string variant = "llama-b4920-bin-ubuntu-vulkan-x64.zip"; auto [version, arch] = engine_matcher_utils::GetVersionAndArch(variant); - EXPECT_EQ(version, "v0.1.25-25.08.24"); - EXPECT_EQ(arch, "mac-amd64"); + EXPECT_EQ(version, "b4920"); + EXPECT_EQ(arch, "ubuntu-vulkan-x64"); } } diff --git a/engine/test/components/test_function_calling.cc 
b/engine/test/components/test_function_calling.cc deleted file mode 100644 index 7a4810b29..000000000 --- a/engine/test/components/test_function_calling.cc +++ /dev/null @@ -1,157 +0,0 @@ -#include -#include "gtest/gtest.h" -#include "json/json.h" -#include "utils/function_calling/common.h" - -class FunctionCallingUtilsTest : public ::testing::Test { - protected: - std::shared_ptr createTestRequest() { - auto request = std::make_shared(); - (*request)["tools"] = Json::Value(Json::arrayValue); - return request; - } -}; - -TEST_F(FunctionCallingUtilsTest, ReplaceCustomFunctions) { - std::string original = "Test placeholder"; - std::string replacement = "Custom function"; - std::string result = - function_calling_utils::ReplaceCustomFunctions(original, replacement); - EXPECT_EQ(result, "Test Custom function placeholder"); -} - -TEST_F(FunctionCallingUtilsTest, HasTools) { - auto request = createTestRequest(); - EXPECT_FALSE(function_calling_utils::HasTools(request)); - - (*request)["tools"].append(Json::Value()); - EXPECT_TRUE(function_calling_utils::HasTools(request)); - - (*request)["tools"] = "random"; - EXPECT_FALSE(function_calling_utils::HasTools(request)); - - (*request)["tools"] = Json::Value::null; - EXPECT_FALSE(function_calling_utils::HasTools(request)); -} - -TEST_F(FunctionCallingUtilsTest, ProcessTools) { - auto request = createTestRequest(); - Json::Value tool; - tool["type"] = "function"; - tool["function"]["name"] = "test_function"; - tool["function"]["description"] = "Test description"; - (*request)["tools"].append(tool); - - std::string result = function_calling_utils::ProcessTools(request); - EXPECT_TRUE( - result.find("Use the function 'test_function' to: Test description") != - std::string::npos); -} - -TEST_F(FunctionCallingUtilsTest, ParseMultipleFunctionStrings) { - std::string input = - "{\"arg\":\"value1\"}{\"arg\":\"value2\"}"; - Json::Value result = - function_calling_utils::ParseMultipleFunctionStrings(input); - - ASSERT_EQ(result.size(), 2); - EXPECT_EQ(result[0]["function"]["name"].asString(), "func1"); - EXPECT_EQ(result[0]["function"]["arguments"].asString(), - "{\"arg\":\"value1\"}"); - EXPECT_EQ(result[1]["function"]["name"].asString(), "func2"); - EXPECT_EQ(result[1]["function"]["arguments"].asString(), - "{\"arg\":\"value2\"}"); -} - -TEST_F(FunctionCallingUtilsTest, ConvertJsonToFunctionStrings) { - Json::Value jsonArray(Json::arrayValue); - Json::Value function1, function2; - function1["function"]["name"] = "func1"; - function1["function"]["arguments"] = "{\"arg\":\"value1\"}"; - function2["function"]["name"] = "func2"; - function2["function"]["arguments"] = "{\"arg\":\"value2\"}"; - jsonArray.append(function1); - jsonArray.append(function2); - - std::string result = - function_calling_utils::ConvertJsonToFunctionStrings(jsonArray); - EXPECT_EQ(result, - "{\"arg\":\"value1\"}{\"arg\":\"value2\"}"); -} - -TEST_F(FunctionCallingUtilsTest, CreateCustomFunctionsString) { - auto request = createTestRequest(); - Json::Value tool; - tool["type"] = "function"; - tool["function"]["name"] = "test_function"; - tool["function"]["description"] = "Test description"; - (*request)["tools"].append(tool); - - std::string result = - function_calling_utils::CreateCustomFunctionsString(request); - EXPECT_TRUE(result.find("```") != std::string::npos); - EXPECT_TRUE( - result.find("Use the function 'test_function' to: Test description") != - std::string::npos); -} - -TEST_F(FunctionCallingUtilsTest, IsValidToolChoiceFormat) { - Json::Value validTool; - validTool["type"] = 
"function"; - validTool["function"]["name"] = "test_function"; - EXPECT_TRUE(function_calling_utils::IsValidToolChoiceFormat(validTool)); - - Json::Value invalidTool; - EXPECT_FALSE(function_calling_utils::IsValidToolChoiceFormat(invalidTool)); -} - -TEST_F(FunctionCallingUtilsTest, UpdateMessages) { - auto request = createTestRequest(); - std::string system_prompt = "Original prompt"; - (*request)["messages"] = Json::Value(Json::arrayValue); - - function_calling_utils::UpdateMessages(system_prompt, request); - - ASSERT_TRUE((*request)["messages"].isArray()); - EXPECT_EQ((*request)["messages"][0]["role"].asString(), "system"); - EXPECT_EQ((*request)["messages"][0]["content"].asString(), system_prompt); -} - -TEST_F(FunctionCallingUtilsTest, PreprocessRequest) { - auto request = createTestRequest(); - Json::Value tool; - tool["type"] = "function"; - tool["function"]["name"] = "test_function"; - tool["function"]["description"] = "Test description"; - (*request)["tools"].append(tool); - - function_calling_utils::PreprocessRequest(request); - - ASSERT_TRUE((*request)["messages"].isArray()); - EXPECT_TRUE((*request)["messages"][0]["content"].asString().find( - "Test description") != std::string::npos); -} - -TEST_F(FunctionCallingUtilsTest, PostProcessResponse) { - Json::Value response; - response["choices"] = Json::Value(Json::arrayValue); - Json::Value choice; - choice["message"]["content"] = - "{\"arg\":\"value\"}"; - response["choices"].append(choice); - - function_calling_utils::PostProcessResponse(response); - - EXPECT_EQ(response["choices"][0]["message"]["content"].asString(), ""); - EXPECT_TRUE(response["choices"][0]["message"]["tool_calls"].isArray()); - EXPECT_EQ( - response["choices"][0]["message"]["tool_calls"][0]["function"]["name"] - .asString(), - "test_function"); - EXPECT_EQ(response["choices"][0]["message"]["tool_calls"][0]["function"] - ["arguments"] - .asString(), - "{\"arg\":\"value\"}"); -} \ No newline at end of file diff --git a/engine/test/components/test_github_release_utils.cc b/engine/test/components/test_github_release_utils.cc index ae1e2c7c2..20c14b187 100644 --- a/engine/test/components/test_github_release_utils.cc +++ b/engine/test/components/test_github_release_utils.cc @@ -4,16 +4,16 @@ class GitHubReleaseUtilsTest : public ::testing::Test {}; TEST_F(GitHubReleaseUtilsTest, AbleToGetReleaseByVersion) { - auto version{"v0.1.36"}; + auto version{"b4920"}; auto result = github_release_utils::GetReleaseByVersion( - "menloresearch", "cortex.llamacpp", version); + kMenloOrg, "llama.cpp", version); ASSERT_TRUE(result.has_value()); ASSERT_EQ(result->tag_name, version); } TEST_F(GitHubReleaseUtilsTest, AbleToGetReleaseList) { - auto result = github_release_utils::GetReleases("menloresearch", "cortex.llamacpp"); + auto result = github_release_utils::GetReleases(kMenloOrg, "llama.cpp"); ASSERT_TRUE(result.has_value()); ASSERT_TRUE(result->size() > 0); diff --git a/engine/test/components/test_string_utils.cc b/engine/test/components/test_string_utils.cc index 42211b668..e12046136 100644 --- a/engine/test/components/test_string_utils.cc +++ b/engine/test/components/test_string_utils.cc @@ -288,6 +288,47 @@ TEST_F(StringUtilsTestSuite, LargeInputPerformance) { EXPECT_EQ(RemoveSubstring(large_input, to_remove), ""); } +TEST(LTrimTest, EmptyString) { + std::string s = ""; + LTrim(s); + EXPECT_EQ(s, ""); +} + +TEST(LTrimTest, NoSpaces) { + std::string s = "HelloWorld"; + LTrim(s); + EXPECT_EQ(s, "HelloWorld"); +} + +TEST(LTrimTest, LeadingSpaces) { + std::string s = " 
HelloWorld"; + LTrim(s); + EXPECT_EQ(s, "HelloWorld"); +} + +TEST(LTrimTest, LeadingTabs) { + std::string s = "\t\tHelloWorld"; + LTrim(s); + EXPECT_EQ(s, "HelloWorld"); +} + +TEST(LTrimTest, LeadingNewlines) { + std::string s = "\n\nHelloWorld"; + LTrim(s); + EXPECT_EQ(s, "HelloWorld"); +} + +TEST(LTrimTest, OnlySpaces) { + std::string s = " "; + LTrim(s); + EXPECT_EQ(s, ""); +} + +TEST(LTrimTest, MixedSpaces) { + std::string s = " \t\nHelloWorld "; + LTrim(s); + EXPECT_EQ(s, "HelloWorld "); +} TEST_F(StringUtilsTestSuite, UrlPaths_SimilarStrings) { std::string str1 = "/v1/threads/{1}/messages/{2}"; diff --git a/engine/utils/cli_selection_utils.h b/engine/utils/cli_selection_utils.h index dca6fe675..487c21e6b 100644 --- a/engine/utils/cli_selection_utils.h +++ b/engine/utils/cli_selection_utils.h @@ -27,13 +27,13 @@ inline void PrintMenu( inline std::optional GetNumericValue(const std::string& sval) { try { - return std::stoi(sval); + return std::stoi(sval); } catch (const std::invalid_argument&) { - // Not a valid number - return std::nullopt; + // Not a valid number + return std::nullopt; } catch (const std::out_of_range&) { - // Number out of range - return std::nullopt; + // Number out of range + return std::nullopt; } } @@ -73,14 +73,16 @@ inline std::optional PrintModelSelection( } // Validate if the selection consists solely of numeric characters - if(!std::all_of(selection.begin(), selection.end(), ::isdigit)){ + if (!std::all_of(selection.begin(), selection.end(), ::isdigit)) { return std::nullopt; } // deal with out of range numeric values std::optional numeric_value = GetNumericValue(selection); - - if (!numeric_value.has_value() || (unsigned) numeric_value.value() > availables.size() || numeric_value.value() < 1) { + + if (!numeric_value.has_value() || + (unsigned)numeric_value.value() > availables.size() || + numeric_value.value() < 1) { return std::nullopt; } @@ -101,13 +103,15 @@ inline std::optional PrintSelection( } // Validate if the selection consists solely of numeric characters - if(!std::all_of(selection.begin(), selection.end(), ::isdigit)){ + if (!std::all_of(selection.begin(), selection.end(), ::isdigit)) { return std::nullopt; } - + // deal with out of range numeric values std::optional numeric_value = GetNumericValue(selection); - if (!numeric_value.has_value() ||(unsigned) numeric_value.value() > options.size() || numeric_value.value() < 1) { + if (!numeric_value.has_value() || + (unsigned)numeric_value.value() > options.size() || + numeric_value.value() < 1) { return std::nullopt; } diff --git a/engine/utils/cuda_toolkit_utils.h b/engine/utils/cuda_toolkit_utils.h index 748af1bd3..e7aadfdd6 100644 --- a/engine/utils/cuda_toolkit_utils.h +++ b/engine/utils/cuda_toolkit_utils.h @@ -7,32 +7,7 @@ inline std::string GetCompatibleCudaToolkitVersion( const std::string& driver_semantic_version, const std::string& os, const std::string& engine) { - if (engine == "cortex.tensorrt-llm") { - // if the engine is cortex.tensorrt-llm, the minimum required CUDA version is 12.4 - if (os == "windows") { - if (semantic_version_utils::CompareSemanticVersion( - driver_semantic_version, "527.41") >= 0) { - return "12.4"; - } else { - throw std::runtime_error( - "GPU driver version not supported. Minimum " - "required driver version is 527.41"); - } - } else if (os == "linux") { - if (semantic_version_utils::CompareSemanticVersion( - driver_semantic_version, "525.60.13") >= 0) { - return "12.4"; - } else { - throw std::runtime_error( - "GPU driver version not supported. 
Minimum required driver version " - "is 525.60.13"); - } - } else { - throw std::runtime_error("Unsupported OS"); - } - } - - if (os == "windows") { + if (os == "windows" || os == "win") { if (semantic_version_utils::CompareSemanticVersion(driver_semantic_version, "527.41") >= 0) { return "12.4"; @@ -44,7 +19,7 @@ inline std::string GetCompatibleCudaToolkitVersion( "GPU driver version not supported. Minimum " "required driver version is 452.39"); } - } else if (os == "linux") { + } else if (os == "linux" || os == "ubuntu") { if (semantic_version_utils::CompareSemanticVersion(driver_semantic_version, "525.60.13") >= 0) { return "12.4"; diff --git a/engine/utils/dylib_path_manager.cc b/engine/utils/dylib_path_manager.cc index 7c389df06..878620185 100644 --- a/engine/utils/dylib_path_manager.cc +++ b/engine/utils/dylib_path_manager.cc @@ -26,7 +26,7 @@ cpp::result DylibPathManager::RegisterPath( } return cpp::fail("Failed to add DLL directory: " + path.string()); } else { - CTL_DBG("Added DLL directory: " << path.string()); + CTL_INF("Added DLL directory: " << path.string()); } dylib_paths.push_back({path, cookie}); diff --git a/engine/utils/engine_constants.h b/engine/utils/engine_constants.h index 2c5cd1be3..695afb4c5 100644 --- a/engine/utils/engine_constants.h +++ b/engine/utils/engine_constants.h @@ -5,20 +5,23 @@ constexpr const auto kLlamaEngine = "llama-cpp"; constexpr const auto kRemote = "remote"; constexpr const auto kLocal = "local"; +constexpr const auto kLlamaRepo = "llama.cpp"; +constexpr const auto kLlamaLibPath = "./engines/llama.cpp"; +constexpr const auto kLlamaServer = "llama-server"; -constexpr const auto kLlamaRepo = "cortex.llamacpp"; - -constexpr const auto kLlamaLibPath = "./engines/cortex.llamacpp"; +constexpr const auto kMenloOrg = "menloresearch"; +constexpr const auto kGgmlOrg = "ggml-org"; // other constants constexpr auto static kHuggingFaceHost = "huggingface.co"; constexpr auto static kGitHubHost = "api.github.com"; constexpr auto static kCortexFolderName = "cortexcpp"; -constexpr auto static kDefaultGHUserAgent = "menloresearch"; +constexpr auto static kDefaultGHUserAgent = kMenloOrg; -constexpr auto static kWindowsOs = "windows"; +constexpr auto static kWindowsOs = "win"; constexpr auto static kMacOs = "mac"; constexpr auto static kLinuxOs = "linux"; +constexpr auto static kUbuntuOs = "ubuntu"; constexpr auto static kUnsupportedOs = "Unsupported OS"; constexpr auto static kCurlGetTimeout = 10; diff --git a/engine/utils/engine_matcher_utils.h b/engine/utils/engine_matcher_utils.h index 0b0cb26be..1afdd194c 100644 --- a/engine/utils/engine_matcher_utils.h +++ b/engine/utils/engine_matcher_utils.h @@ -7,6 +7,7 @@ #include #include #include "utils/cpuid/cpu_info.h" +#include "utils/engine_constants.h" #include "utils/logging_utils.h" #include "utils/result.hpp" #include "utils/string_utils.h" @@ -24,13 +25,19 @@ inline cpp::result GetVariantFromNameAndVersion( if (engine.empty()) { return cpp::fail("Engine name is empty"); } - auto nv = string_utils::RemoveSubstring(version, "v"); - using namespace string_utils; - auto removed_extension = RemoveSubstring(engine_file_name, ".tar.gz"); - auto version_and_variant = RemoveSubstring(removed_extension, engine + "-"); - - auto variant = RemoveSubstring(version_and_variant, nv + "-"); - return variant; + CTL_DBG("version: " << version); + namespace su = string_utils; + CTL_DBG("engine_file_name: " << engine_file_name); + auto rm_extension_menlo = su::RemoveSubstring(engine_file_name, ".tar.gz"); + auto rm_extension_ggml 
= su::RemoveSubstring(rm_extension_menlo, ".zip"); + CTL_DBG("removed_extension: " << rm_extension_ggml); + auto version_and_variant = + su::RemoveSubstring(rm_extension_ggml, engine + "-"); + CTL_DBG("version_and_variant: " << version_and_variant); + auto variant = su::RemoveSubstring(version_and_variant, version + "-"); + auto v = su::RemoveSubstring(variant, "llama-bin-"); + CTL_DBG("variant: " << v); + return v; } inline std::string GetSuitableAvxVariant(cortex::cpuid::CpuInfo& cpu_info) { @@ -48,7 +55,7 @@ inline std::string GetSuitableAvxVariant(cortex::cpuid::CpuInfo& cpu_info) { inline std::string GetSuitableCudaVariant( const std::vector& variants, const std::string& cuda_version) { - std::regex cuda_reg("cuda-(\\d+)-(\\d+)"); + std::regex cuda_reg("cuda-cu(\\d+).(\\d+)"); std::smatch match; int requested_major = 0; @@ -141,8 +148,9 @@ inline std::string Validate(const std::vector& variants, const std::string& os, const std::string& cpu_arch, const std::string& suitable_avx, const std::string& cuda_version) { + // CTL_INF(os << " " << cpu_arch); // Early return if the OS is not supported - if (os != "mac" && os != "windows" && os != "linux") { + if (os != kMacOs && os != kWindowsOs && os != kLinuxOs) { return ""; } @@ -150,6 +158,12 @@ inline std::string Validate(const std::vector& variants, std::copy_if(variants.begin(), variants.end(), std::back_inserter(os_and_arch_compatible_list), [&os, &cpu_arch](const std::string& variant) { + // In case of Linux, we need to include ubuntu version also + if (os == kLinuxOs) { + if (variant.find(kUbuntuOs) != std::string::npos && + variant.find(cpu_arch) != std::string::npos) + return true; + } auto os_match = "-" + os; auto cpu_arch_match = "-" + cpu_arch; @@ -157,10 +171,10 @@ inline std::string Validate(const std::vector& variants, variant.find(cpu_arch_match) != std::string::npos; }); - if (os == "mac" && !os_and_arch_compatible_list.empty()) + if (os == kMacOs && !os_and_arch_compatible_list.empty()) return os_and_arch_compatible_list[0]; - if (os == "linux" && cpu_arch == "arm64" && + if (os == kLinuxOs && cpu_arch == "arm64" && !os_and_arch_compatible_list.empty()) { return os_and_arch_compatible_list[0]; } @@ -170,7 +184,14 @@ inline std::string Validate(const std::vector& variants, std::copy_if(os_and_arch_compatible_list.begin(), os_and_arch_compatible_list.end(), std::back_inserter(avx_compatible_list), - [&suitable_avx](const std::string& variant) { + [&os, &cpu_arch, &suitable_avx](const std::string& variant) { + if (os == kLinuxOs && + (suitable_avx == "avx2" || suitable_avx == "avx512" || + cpu_arch == "arm64")) { + if (variant.find(std::string(kUbuntuOs) + "-" + cpu_arch) != + std::string::npos) + return true; + } auto suitable_avx_match = "-" + suitable_avx; return variant.find(suitable_avx_match) != std::string::npos; @@ -185,15 +206,18 @@ inline std::string Validate(const std::vector& variants, inline std::pair GetVersionAndArch( const std::string& file_name) { // Remove the file extension - std::string base = file_name.substr(0, file_name.find("tar") - 1); + std::string b = string_utils::RemoveSubstring(file_name, ".tar.gz"); + std::string base = string_utils::RemoveSubstring(b, ".zip"); size_t arch_pos = 0; - if (base.find("windows") != std::string::npos) { - arch_pos = base.find("-windows"); + if (base.find("win") != std::string::npos) { + arch_pos = base.find("-bin-win"); } else if (base.find("linux") != std::string::npos) { - arch_pos = base.find("-linux"); + arch_pos = base.find("-bin-linux"); + } else if 
(base.find("ubuntu") != std::string::npos) { + arch_pos = base.find("-bin-ubuntu"); } else { - arch_pos = base.find("-mac"); + arch_pos = base.find("-bin-macos"); } // Extract architecture part @@ -202,6 +226,6 @@ inline std::pair GetVersionAndArch( // Extract version part size_t v_pos = base.find_first_of('-'); auto version = base.substr(v_pos + 1, arch_pos - v_pos - 1); - return std::pair("v" + version, arch); + return std::pair(version, string_utils::RemoveSubstring(arch, "bin-")); } } // namespace engine_matcher_utils diff --git a/engine/utils/function_calling/common.h b/engine/utils/function_calling/common.h index 34a1c9862..953a9964c 100644 --- a/engine/utils/function_calling/common.h +++ b/engine/utils/function_calling/common.h @@ -129,157 +129,4 @@ inline Json::Value ParseJsonString(const std::string& jsonString) { return root; } -inline std::string CreateCustomFunctionsString( - std::shared_ptr request) { - std::string customFunctions = ProcessTools(request); - if (customFunctions.empty()) { - return ""; // No custom functions found - } - - return "```\n" + customFunctions + "```"; -} -inline bool IsValidToolChoiceFormat(const Json::Value& root) { - return root.isObject() && root.isMember("type") && root["type"].isString() && - root["type"].asString() == "function" && root.isMember("function") && - root["function"].isObject() && root["function"].isMember("name") && - root["function"]["name"].isString(); -} -inline void UpdateMessages(std::string& system_prompt, - std::shared_ptr request) { - Json::Value tool_choice = request->get("tool_choice", "auto"); - if (tool_choice.isString() && tool_choice.asString() == "required") { - system_prompt += - "\n\nYou must call a function to answer the user's question."; - } else if (!tool_choice.isString()) { - - system_prompt += - "\n\nNow this is your first priority: You must call the function '" + - tool_choice["function"]["name"].asString() + - "' to answer the user's question."; - } - bool parallel_tool_calls = request->get("parallel_tool_calls", true).asBool(); - if (!parallel_tool_calls) { - system_prompt += "\n\nNow this is your first priority: You must call the only one function at a time."; - } - - bool tools_call_in_user_message = - request->get("tools_call_in_user_message", false).asBool(); - - bool original_stream_config = (*request).get("stream", false).asBool(); - // (*request)["grammar"] = function_calling_utils::gamma_json; - (*request)["stream"] = - false; //when using function calling, disable stream automatically because we need to parse the response to get function name and params - - if (!request->isMember("messages") || !(*request)["messages"].isArray() || - (*request)["messages"].empty()) { - // If no messages, add the system prompt as the first message - Json::Value systemMessage; - systemMessage["role"] = "system"; - systemMessage["content"] = system_prompt; - (*request)["messages"].append(systemMessage); - } else { - - if (tools_call_in_user_message) { - for (Json::Value& message : (*request)["messages"]) { - if (message["role"] == "user" && message.isMember("tools") && - message["tools"].isArray() && message["tools"].size() > 0) { - message["content"] = system_prompt + "\n User question: " + - message["content"].asString(); - } - } - } else { - Json::Value& firstMessage = (*request)["messages"][0]; - if (firstMessage["role"] == "system") { - bool addCustomPrompt = - request->get("add_custom_system_prompt", true).asBool(); - if (addCustomPrompt) { - firstMessage["content"] = - system_prompt + "\n" + 
firstMessage["content"].asString(); - } - } else { - // If the first message is not a system message, prepend the system prompt - Json::Value systemMessage; - systemMessage["role"] = "system"; - systemMessage["content"] = system_prompt; - (*request)["messages"].insert(0, systemMessage); - } - } - - // transform last message role to tool if it is a function call - Json::Value& lastMessage = - (*request)["messages"][(*request)["messages"].size() - 1]; - if (lastMessage.get("role", "") == "tool") { - lastMessage["role"] = function_calling_llama3_1_utils::tool_role; - (*request)["stream"] = - original_stream_config; // if role is tool then should restore stream config to original value - } - } - for (Json::Value& message : (*request)["messages"]) { - if (message["role"] == "assistant" && message.isMember("tool_calls")) { - const Json::Value& tool_calls = message["tool_calls"]; - if (!tool_calls.isNull() && tool_calls.isArray() && - tool_calls.size() > 0) { - message["content"] = ConvertJsonToFunctionStrings(tool_calls); - message["tool_calls"] = {}; - } - } - } -} -inline void PreprocessRequest(std::shared_ptr request) { - if (!function_calling_utils::HasTools(request)) { - return; // Exit if no tools present - } - if (request->get("tool_choice", "auto").isString()) { - std::string tool_choice = request->get("tool_choice", "auto").asString(); - if (tool_choice == "none") { - return; // Exit if tool_choice is none - } - } - std::string customFunctionsString = - function_calling_utils::CreateCustomFunctionsString(request); - std::string new_system_prompt = - function_calling_utils::ReplaceCustomFunctions( - function_calling_llama3_1_utils::system_prompt, - customFunctionsString); - UpdateMessages(new_system_prompt, request); -} - -inline void PostProcessResponse(Json::Value& response) { - if (!response.isMember("choices") || !response["choices"].isArray() || - response["choices"].empty()) { - // If there are no choices or the structure is incorrect, do nothing - return; - } - - // Get a reference to the first choice - Json::Value& firstChoice = response["choices"][0]; - - // Check if the choice has a message with content - if (firstChoice.isMember("message") && - firstChoice["message"].isMember("content")) { - std::string content = firstChoice["message"]["content"].asString(); - - // Create a new structure for tool_calls - Json::Value toolCall = ParseMultipleFunctionStrings(content); - if (toolCall.size() > 0) { - // Add tool_calls to the message - if (response.get("tool_choice", "auto").isString()) { - std::string tool_choice = - response.get("tool_choice", "auto").asString(); - if (tool_choice == "auto") { - firstChoice["finish_reason"] = "tool_calls"; - } else { - firstChoice["finish_reason"] = "stop"; - } - } - - firstChoice["message"]["tool_calls"] = toolCall; - - // Clear the content as it's now represented in tool_calls - firstChoice["message"]["content"] = ""; - } - } - - // Add any additional post-processing logic here -} } // namespace function_calling_utils diff --git a/engine/utils/github_release_utils.h b/engine/utils/github_release_utils.h index 29f8a5725..84636903a 100644 --- a/engine/utils/github_release_utils.h +++ b/engine/utils/github_release_utils.h @@ -178,11 +178,6 @@ inline cpp::result GetReleaseByVersion( std::vector path_params{"repos", author, repo, "releases"}; if (tag != "latest") { path_params.push_back("tags"); - - if (!string_utils::StartsWith(tag, "v")) { - path_params.push_back("v" + tag); - } - path_params.push_back(tag); } else { 
path_params.push_back("latest"); diff --git a/engine/utils/process/utils.cc b/engine/utils/process/utils.cc index f63de5c5e..c9ccddfdf 100644 --- a/engine/utils/process/utils.cc +++ b/engine/utils/process/utils.cc @@ -347,7 +347,7 @@ bool KillProcess(ProcessInfo& proc_info) { bool success; #if defined(_WIN32) - success = TerminateJobObject(proc_info.hJob, 0) == 0; + success = TerminateJobObject(proc_info.hJob, 0); #elif defined(__APPLE__) || defined(__linux__) // we send SIGTERM to subprocess. we trust that this subprocess will // propagate SIGTERM correctly to its children processes. diff --git a/engine/utils/string_utils.h b/engine/utils/string_utils.h index a9ea756b3..e1a567942 100644 --- a/engine/utils/string_utils.h +++ b/engine/utils/string_utils.h @@ -22,6 +22,12 @@ inline std::string RTrim(const std::string& str) { return (end == std::string::npos) ? "" : str.substr(0, end + 1); } +inline void LTrim(std::string& s) { + s.erase(s.begin(), std::find_if(s.begin(), s.end(), [](unsigned char ch) { + return !std::isspace(ch); + })); +}; + inline void Trim(std::string& s) { s.erase(s.begin(), std::find_if(s.begin(), s.end(), [](unsigned char ch) { return !std::isspace(ch); diff --git a/engine/utils/system_info_utils.h b/engine/utils/system_info_utils.h index 54eaed8c9..9bef6f4f9 100644 --- a/engine/utils/system_info_utils.h +++ b/engine/utils/system_info_utils.h @@ -70,7 +70,7 @@ inline std::unique_ptr GetSystemInfo() { #if defined(__i386__) || defined(__x86_64__) || defined(__amd64__) || \ defined(__amd64) || defined(__x86_64) || defined(_M_AMD64) - arch << "amd64"; + arch << "x64"; #elif defined(__arm__) || defined(__arm) || defined(__arm64__) || \ defined(__aarch64__) || defined(__thumb__) || \ defined(__TARGET_ARCH_ARM) || defined(__TARGET_ARCH_THUMB) || \ diff --git a/function-calling.py b/function-calling.py new file mode 100644 index 000000000..32ef31752 --- /dev/null +++ b/function-calling.py @@ -0,0 +1,173 @@ +from datetime import datetime +from openai import OpenAI +from pydantic import BaseModel +import json + +# MODEL = "deepseek-r1-distill-qwen-7b:7b" +MODEL = "llama3.1:8b-q8" + +client = OpenAI( + base_url="http://localhost:39281/v1", + api_key="not-needed", # Authentication is not required for local deployment +) + +tools = [ + { + "type": "function", + "function": { + "name": "puppeteer_navigate", + "description": "Navigate to a URL", + "parameters": { + "properties": {"url": {"type": "string"}}, + "required": ["url"], + "type": "object", + }, + "strict": False, + }, + }, + { + "type": "function", + "function": { + "name": "puppeteer_screenshot", + "description": "Take a screenshot of the current page or a specific element", + "parameters": { + "properties": { + "height": { + "description": "Height in pixels (default: 600)", + "type": "number", + }, + "name": { + "description": "Name for the screenshot", + "type": "string", + }, + "selector": { + "description": "CSS selector for element to screenshot", + "type": "string", + }, + "width": { + "description": "Width in pixels (default: 800)", + "type": "number", + }, + }, + "required": ["name"], + "type": "object", + }, + "strict": False, + }, + }, + { + "type": "function", + "function": { + "name": "puppeteer_click", + "description": "Click an element on the page", + "parameters": { + "properties": { + "selector": { + "description": "CSS selector for element to click", + "type": "string", + } + }, + "required": ["selector"], + "type": "object", + }, + "strict": False, + }, + }, + { + "type": "function", + "function": { + 
"name": "puppeteer_fill", + "description": "Fill out an input field", + "parameters": { + "properties": { + "selector": { + "description": "CSS selector for input field", + "type": "string", + }, + "value": {"description": "Value to fill", "type": "string"}, + }, + "required": ["selector", "value"], + "type": "object", + }, + "strict": False, + }, + }, + { + "type": "function", + "function": { + "name": "puppeteer_select", + "description": "Select an element on the page with Select tag", + "parameters": { + "properties": { + "selector": { + "description": "CSS selector for element to select", + "type": "string", + }, + "value": {"description": "Value to select", "type": "string"}, + }, + "required": ["selector", "value"], + "type": "object", + }, + "strict": False, + }, + }, + { + "type": "function", + "function": { + "name": "puppeteer_hover", + "description": "Hover an element on the page", + "parameters": { + "properties": { + "selector": { + "description": "CSS selector for element to hover", + "type": "string", + } + }, + "required": ["selector"], + "type": "object", + }, + "strict": False, + }, + }, + { + "type": "function", + "function": { + "name": "puppeteer_evaluate", + "description": "Execute JavaScript in the browser console", + "parameters": { + "properties": { + "script": { + "description": "JavaScript code to execute", + "type": "string", + } + }, + "required": ["script"], + "type": "object", + }, + "strict": False, + }, + }, +] + +completion_payload = { + "messages": [ + { + "role": "system", + "content": 'You have access to the following CUSTOM functions:\n\n\n\nIf a you choose to call a function ONLY reply in the following format:\n<{start_tag}={function_name}>{parameters}{end_tag}\nwhere\n\nstart_tag => ` a JSON dict with the function argument name as key and function argument value as value.\nend_tag => ``\n\nHere is an example,\n{"example_name": "example_value"}\n\nReminder:\n- Function calls MUST follow the specified format\n- Required parameters MUST be specified\n- You can call one or more functions at a time, but remember only chose correct function\n- Put the entire function call reply on one line\n- Always add your sources when using search results to answer the user query\n- If you can not find correct parameters or arguments corresponding to function in the user\'s message, ask user again to provide, do not make assumptions.\n- No explanation are needed when calling a function.\n\nYou are a helpful assistant.', + }, + { + "role": "user", + "content": "go to google search", + }, + ] +} + +response = client.chat.completions.create( + top_p=0.9, + temperature=0.6, + model=MODEL, + messages=completion_payload["messages"], + tools=tools, +) + +print(response) \ No newline at end of file