instructlab · courtneypacheco · Apr 17, 2025 · Apr 10, 2025
diff --git a/.github/workflows/e2e-nvidia-l4-x1.yml b/.github/workflows/e2e-nvidia-l4-x1.yml
@@ -134,7 +134,7 @@ jobs:
           # https://github.com/instructlab/instructlab/issues/1821
           # install with Torch and build dependencies installed
           python3.11 -m pip install -v packaging wheel setuptools-scm
-          python3.11 -m pip install -v .[cuda] -r requirements-vllm-cuda.txt
+          python3.11 -m pip install -v .[cuda] -r requirements-vllm-cuda.txt -c constraints-dev.txt
 
       - name: Check disk before tests
         run: |

diff --git a/.github/workflows/e2e-nvidia-l40s-x4-llama.yml b/.github/workflows/e2e-nvidia-l40s-x4-llama.yml
@@ -20,28 +20,58 @@ jobs:
   start-large-ec2-runner:
     runs-on: ubuntu-latest
     outputs:
-      label: ${{ steps.start-ec2-runner.outputs.label }}
-      ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
+      label: ${{ steps.launch-ec2-instance-with-fallback.outputs.label }}
+      ec2-instance-id: ${{ steps.launch-ec2-instance-with-fallback.outputs.ec2-instance-id }}
+      ec2-instance-region: ${{ steps.launch-ec2-instance-with-fallback.outputs.ec2-instance-region }}
     steps:
-      - name: Configure AWS credentials
-        uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
+      - name: Checkout "launch-ec2-runner-with-fallback" in-house CI action
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
         with:
-          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
-          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-          aws-region: ${{ vars.AWS_REGION }}
-
-      - name: Start EC2 runner
-        id: start-ec2-runner
-        uses: machulav/ec2-github-runner@fcfb31a5760dad1314a64a0e172b78ec6fc8a17e # v2.3.6
+          repository: instructlab/ci-actions
+          # clone the "ci-actions" repo to a local directory called "ci-actions", instead of overwriting the current WORKDIR contents
+          path: ci-actions
+          ref: release-v0.1
+          sparse-checkout: |
+            actions/launch-ec2-runner-with-fallback
+
+      - name: Launch EC2 Runner with Fallback
+        id: launch-ec2-instance-with-fallback
+        uses: ./ci-actions/actions/launch-ec2-runner-with-fallback
+        env:
+          TMPDIR: "/tmp"
         with:
-          mode: start
-          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
-          ec2-image-id: ${{ vars.AWS_EC2_AMI }}
-          ec2-instance-type: g6e.12xlarge
-          subnet-id: subnet-024298cefa3bedd61
-          security-group-id: sg-06300447c4a5fbef3
-          iam-role-name: instructlab-ci-runner
-          aws-resource-tags: >
+          aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          github_token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
+          regions_config: >
+            [
+              {
+                "region": "us-east-2",
+                "subnets": {
+                  "us-east-2a": "${{ vars.SUBNET_US_EAST_2A }}",
+                  "us-east-2b": "${{ vars.SUBNET_US_EAST_2B }}",
+                  "us-east-2c": "${{ vars.SUBNET_US_EAST_2C }}"
+                },
+                "ec2-ami": "${{ vars.AWS_EC2_AMI_US_EAST_2 }}",
+                "security-group-id": "${{ vars.SECURITY_GROUP_ID_US_EAST_2 }}"
+              },
+              {
+                "region": "us-east-1",
+                "subnets": {
+                  "us-east-1a": "${{ vars.SUBNET_US_EAST_1A }}",
+                  "us-east-1b": "${{ vars.SUBNET_US_EAST_1B }}",
+                  "us-east-1c": "${{ vars.SUBNET_US_EAST_1C }}",
+                  "us-east-1d": "${{ vars.SUBNET_US_EAST_1D }}",
+                  "us-east-1e": "${{ vars.SUBNET_US_EAST_1E }}",
+                  "us-east-1f": "${{ vars.SUBNET_US_EAST_1F }}"
+                },
+                "ec2-ami": "${{ vars.AWS_EC2_AMI_US_EAST_1 }}",
+                "security-group-id": "${{ vars.SECURITY_GROUP_ID_US_EAST_1 }}"
+              }
+            ]
+          try_spot_instance_first: false
+          ec2_instance_type: g6e.12xlarge
+          aws_resource_tags: >
             [
               {"Key": "Name", "Value": "instructlab-ci-github-large-runner"},
               {"Key": "GitHubRepository", "Value": "${{ github.repository }}"},
@@ -243,7 +273,7 @@ jobs:
         with:
           aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
           aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-          aws-region: ${{ vars.AWS_REGION }}
+          aws-region: ${{ needs.start-large-ec2-runner.outputs.ec2-instance-region }}
 
       - name: Stop EC2 runner
         uses: machulav/ec2-github-runner@fcfb31a5760dad1314a64a0e172b78ec6fc8a17e # v2.3.6

diff --git a/.github/workflows/e2e-nvidia-l40s-x4.yml b/.github/workflows/e2e-nvidia-l40s-x4.yml
@@ -146,7 +146,7 @@ jobs:
           # https://github.com/instructlab/instructlab/issues/1821
           # install with Torch and build dependencies installed
           python3.11 -m pip install -v packaging wheel setuptools-scm
-          python3.11 -m pip install -v .[cuda] -r requirements-vllm-cuda.txt
+          python3.11 -m pip install -v .[cuda] -r requirements-vllm-cuda.txt -c constraints-dev.txt
 
       - name: Check disk before tests
         run: |

diff --git a/.github/workflows/e2e-nvidia-l40s-x8.yml b/.github/workflows/e2e-nvidia-l40s-x8.yml
@@ -226,7 +226,7 @@ jobs:
           # https://github.com/instructlab/instructlab/issues/1821
           # install with Torch and build dependencies installed
           python3.11 -m pip install -v packaging wheel setuptools-scm
-          python3.11 -m pip install -v .[cuda] -r requirements-vllm-cuda.txt
+          python3.11 -m pip install -v .[cuda] -r requirements-vllm-cuda.txt -c constraints-dev.txt
 
       - name: Check disk before tests
         run: |

diff --git a/constraints-dev.txt b/constraints-dev.txt
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
+
+torch<2.7.0
+vllm<0.9.0
diff --git a/requirements-vllm-cuda.txt b/requirements-vllm-cuda.txt
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # Dependencies for installing vLLM on CUDA
 
-# vLLM only supports Linux platform (including WSL)
-vllm==0.7.3 ; sys_platform == 'linux' and platform_machine == 'x86_64'
+# vLLM only supports the Linux platform (including WSL). Do not cap this dependency here; cap it in constraints-dev.txt instead.
+vllm>=0.8.0 ; sys_platform == 'linux'
diff --git a/requirements.txt b/requirements.txt
@@ -31,8 +31,8 @@ sentencepiece>=0.2.0
 # "old" version required for vLLM on CUDA to build
 tokenizers>=0.11.1
 toml>=0.10.2
-# Default version. Can be overridden in extra requirements
-torch>=2.3.0,<2.6.0
+# Default version; can be overridden in extra requirements. Do not cap this dependency here; cap it in constraints-dev.txt instead.
+torch>=2.6.0
 tqdm>=4.66.2
 # temporary cap until https://github.com/instructlab/training/pull/443 is merged and consumed within instructlab
 # above PR fixes interactions with newer versions of transformers through the training library

diff --git a/tests/test_package.py b/tests/test_package.py
@@ -16,7 +16,7 @@
 # special cases
 EXTRA_CHECKS = {
     "hpu": {
-        "torch": Version("2.3.1a0"),
+        "torch": Version("2.6.0"),
         "transformers": Version("4.43.0"),
     }
 }

diff --git a/tox.ini b/tox.ini
@@ -21,7 +21,9 @@ package = wheel
 wheel_build_env = pkg
 # equivalent to `pip install instructlab[cpu]`
 extras = cpu
-deps = -r requirements-dev.txt
+deps =
+    -r requirements-dev.txt
+    -c constraints-dev.txt
 commands =
     ilab --version
     {envpython} -m instructlab --version