diff --git a/.github/workflows/e2e-nvidia-l4-x1.yml b/.github/workflows/e2e-nvidia-l4-x1.yml index e93542e488..a4f59abeab 100644 --- a/.github/workflows/e2e-nvidia-l4-x1.yml +++ b/.github/workflows/e2e-nvidia-l4-x1.yml @@ -134,7 +134,7 @@ jobs: # https://github.com/instructlab/instructlab/issues/1821 # install with Torch and build dependencies installed python3.11 -m pip install -v packaging wheel setuptools-scm - python3.11 -m pip install -v .[cuda] -r requirements-vllm-cuda.txt + python3.11 -m pip install -v .[cuda] -r requirements-vllm-cuda.txt -c constraints-dev.txt - name: Check disk before tests run: | diff --git a/.github/workflows/e2e-nvidia-l40s-x4-llama.yml b/.github/workflows/e2e-nvidia-l40s-x4-llama.yml index ee924a53fb..a72ab8049a 100644 --- a/.github/workflows/e2e-nvidia-l40s-x4-llama.yml +++ b/.github/workflows/e2e-nvidia-l40s-x4-llama.yml @@ -20,28 +20,58 @@ jobs: start-large-ec2-runner: runs-on: ubuntu-latest outputs: - label: ${{ steps.start-ec2-runner.outputs.label }} - ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }} + label: ${{ steps.launch-ec2-instance-with-fallback.outputs.label }} + ec2-instance-id: ${{ steps.launch-ec2-instance-with-fallback.outputs.ec2-instance-id }} + ec2-instance-region: ${{ steps.launch-ec2-instance-with-fallback.outputs.ec2-instance-region }} steps: - - name: Configure AWS credentials - uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 + - name: Checkout "launch-ec2-runner-with-fallback" in-house CI action + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: - aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} - aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - aws-region: ${{ vars.AWS_REGION }} - - - name: Start EC2 runner - id: start-ec2-runner - uses: machulav/ec2-github-runner@fcfb31a5760dad1314a64a0e172b78ec6fc8a17e # v2.3.6 + repository: instructlab/ci-actions + # clone the "ci-actions" repo to a local directory called "ci-actions", instead of overwriting the current WORKDIR contents + path: ci-actions + ref: release-v0.1 + sparse-checkout: | + actions/launch-ec2-runner-with-fallback + + - name: Launch EC2 Runner with Fallback + id: launch-ec2-instance-with-fallback + uses: ./ci-actions/actions/launch-ec2-runner-with-fallback + env: + TMPDIR: "/tmp" with: - mode: start - github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }} - ec2-image-id: ${{ vars.AWS_EC2_AMI }} - ec2-instance-type: g6e.12xlarge - subnet-id: subnet-024298cefa3bedd61 - security-group-id: sg-06300447c4a5fbef3 - iam-role-name: instructlab-ci-runner - aws-resource-tags: > + aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + github_token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }} + regions_config: > + [ + { + "region": "us-east-2", + "subnets": { + "us-east-2a": "${{ vars.SUBNET_US_EAST_2A }}", + "us-east-2b": "${{ vars.SUBNET_US_EAST_2B }}", + "us-east-2c": "${{ vars.SUBNET_US_EAST_2C }}" + }, + "ec2-ami": "${{ vars.AWS_EC2_AMI_US_EAST_2 }}", + "security-group-id": "${{ vars.SECURITY_GROUP_ID_US_EAST_2 }}" + }, + { + "region": "us-east-1", + "subnets": { + "us-east-1a": "${{ vars.SUBNET_US_EAST_1A }}", + "us-east-1b": "${{ vars.SUBNET_US_EAST_1B }}", + "us-east-1c": "${{ vars.SUBNET_US_EAST_1C }}", + "us-east-1d": "${{ vars.SUBNET_US_EAST_1D }}", + "us-east-1e": "${{ vars.SUBNET_US_EAST_1E }}", + "us-east-1f": "${{ vars.SUBNET_US_EAST_1F }}" + }, + "ec2-ami": "${{ vars.AWS_EC2_AMI_US_EAST_1 }}", + "security-group-id": "${{ vars.SECURITY_GROUP_ID_US_EAST_1 }}" + } + ] + try_spot_instance_first: false + ec2_instance_type: g6e.12xlarge + aws_resource_tags: > [ {"Key": "Name", "Value": "instructlab-ci-github-large-runner"}, {"Key": "GitHubRepository", "Value": "${{ github.repository }}"}, @@ -243,7 +273,7 @@ jobs: with: aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - aws-region: ${{ vars.AWS_REGION }} + aws-region: ${{ needs.start-large-ec2-runner.outputs.ec2-instance-region }} - name: Stop EC2 runner uses: machulav/ec2-github-runner@fcfb31a5760dad1314a64a0e172b78ec6fc8a17e # v2.3.6 diff --git a/.github/workflows/e2e-nvidia-l40s-x4.yml b/.github/workflows/e2e-nvidia-l40s-x4.yml index 868b97ddfb..d481820e9d 100644 --- a/.github/workflows/e2e-nvidia-l40s-x4.yml +++ b/.github/workflows/e2e-nvidia-l40s-x4.yml @@ -146,7 +146,7 @@ jobs: # https://github.com/instructlab/instructlab/issues/1821 # install with Torch and build dependencies installed python3.11 -m pip install -v packaging wheel setuptools-scm - python3.11 -m pip install -v .[cuda] -r requirements-vllm-cuda.txt + python3.11 -m pip install -v .[cuda] -r requirements-vllm-cuda.txt -c constraints-dev.txt - name: Check disk before tests run: | diff --git a/.github/workflows/e2e-nvidia-l40s-x8.yml b/.github/workflows/e2e-nvidia-l40s-x8.yml index 2baedbad4d..5b69c1282e 100644 --- a/.github/workflows/e2e-nvidia-l40s-x8.yml +++ b/.github/workflows/e2e-nvidia-l40s-x8.yml @@ -226,7 +226,7 @@ jobs: # https://github.com/instructlab/instructlab/issues/1821 # install with Torch and build dependencies installed python3.11 -m pip install -v packaging wheel setuptools-scm - python3.11 -m pip install -v .[cuda] -r requirements-vllm-cuda.txt + python3.11 -m pip install -v .[cuda] -r requirements-vllm-cuda.txt -c constraints-dev.txt - name: Check disk before tests run: | diff --git a/constraints-dev.txt b/constraints-dev.txt new file mode 100644 index 0000000000..9e23019660 --- /dev/null +++ b/constraints-dev.txt @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 + +torch<2.7.0 +vllm<0.9.0 diff --git a/requirements-vllm-cuda.txt b/requirements-vllm-cuda.txt index 9b116c98a2..73c37b67e6 100644 --- a/requirements-vllm-cuda.txt +++ b/requirements-vllm-cuda.txt @@ -1,5 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 # Dependencies for installing vLLM on CUDA -# vLLM only supports Linux platform (including WSL) -vllm==0.7.3 ; sys_platform == 'linux' and platform_machine == 'x86_64' +# vLLM only supports Linux platform (including WSL). Do not cap this dependency here. Cap in constraints-dev.txt +vllm>=0.8.0 ; sys_platform == 'linux' diff --git a/requirements.txt b/requirements.txt index 0aeb60d988..214b1b786a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -31,8 +31,8 @@ sentencepiece>=0.2.0 # "old" version required for vLLM on CUDA to build tokenizers>=0.11.1 toml>=0.10.2 -# Default version. Can be overridden in extra requirements -torch>=2.3.0,<2.6.0 +# Default version. Can be overridden in extra requirements. Do not cap this dependency here. Cap in constraints-dev.txt +torch>=2.6.0 tqdm>=4.66.2 # temporary cap until https://github.com/instructlab/training/pull/443 is merged and consumed within instructlab # above PR fixes interactions with newer versions of transformers through the training library diff --git a/tests/test_package.py b/tests/test_package.py index 7b960cff19..05a0fb3164 100644 --- a/tests/test_package.py +++ b/tests/test_package.py @@ -16,7 +16,7 @@ # special cases EXTRA_CHECKS = { "hpu": { - "torch": Version("2.3.1a0"), + "torch": Version("2.6.0"), "transformers": Version("4.43.0"), } } diff --git a/tox.ini b/tox.ini index 3f6323b5a7..7d40c58a2b 100644 --- a/tox.ini +++ b/tox.ini @@ -21,7 +21,9 @@ package = wheel wheel_build_env = pkg # equivalent to `pip install instructlab[cpu]` extras = cpu -deps = -r requirements-dev.txt +deps = + -r requirements-dev.txt + -c constraints-dev.txt commands = ilab --version {envpython} -m instructlab --version