From 814efb4cf557458957ce01f81569cb9bedda10f2 Mon Sep 17 00:00:00 2001 From: Ignas Baranauskas Date: Thu, 2 Jan 2025 10:30:57 +0000 Subject: [PATCH 01/17] Rename GPU GitHub runner to avoid version confusion (#787) --- .github/workflows/e2e_tests.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/e2e_tests.yaml b/.github/workflows/e2e_tests.yaml index fea42ab6..61781aed 100644 --- a/.github/workflows/e2e_tests.yaml +++ b/.github/workflows/e2e_tests.yaml @@ -21,7 +21,7 @@ env: jobs: kubernetes: - runs-on: ubuntu-20.04-4core-gpu + runs-on: gpu-t4-4-core steps: - name: Checkout code From 757a23c89eebf4868931a8b85982dfcbe0f764d1 Mon Sep 17 00:00:00 2001 From: Ignas Baranauskas Date: Thu, 19 Dec 2024 13:05:04 +0000 Subject: [PATCH 02/17] update GitHub workflows to use Python 3.11 by default --- .../workflows/additional_demo_notebook_tests.yaml | 4 ++-- .github/workflows/coverage-badge.yaml | 4 ++-- .github/workflows/e2e_tests.yaml | 2 +- .github/workflows/guided_notebook_tests.yaml | 6 +++--- .github/workflows/release.yaml | 2 +- .github/workflows/ui_notebooks_test.yaml | 2 +- .github/workflows/unit-tests.yml | 12 ++++++------ 7 files changed, 16 insertions(+), 16 deletions(-) diff --git a/.github/workflows/additional_demo_notebook_tests.yaml b/.github/workflows/additional_demo_notebook_tests.yaml index 03201e10..a081f9ee 100644 --- a/.github/workflows/additional_demo_notebook_tests.yaml +++ b/.github/workflows/additional_demo_notebook_tests.yaml @@ -50,7 +50,7 @@ jobs: - name: Set up specific Python version uses: actions/setup-python@v5 with: - python-version: '3.9' + python-version: '3.11' cache: 'pip' # caching pip dependencies - name: Setup and start KinD cluster @@ -168,7 +168,7 @@ jobs: - name: Set up specific Python version uses: actions/setup-python@v5 with: - python-version: '3.9' + python-version: '3.11' cache: 'pip' # caching pip dependencies - name: Setup and start KinD cluster diff --git 
a/.github/workflows/coverage-badge.yaml b/.github/workflows/coverage-badge.yaml index af273ce9..e2fdddae 100644 --- a/.github/workflows/coverage-badge.yaml +++ b/.github/workflows/coverage-badge.yaml @@ -13,10 +13,10 @@ jobs: steps: - uses: actions/checkout@v4 - - name: Set up Python 3.9 + - name: Set up Python 3.11 uses: actions/setup-python@v5 with: - python-version: 3.9 + python-version: 3.11 - name: Install dependencies run: | python -m pip install --upgrade pip diff --git a/.github/workflows/e2e_tests.yaml b/.github/workflows/e2e_tests.yaml index 61781aed..fca6d6e7 100644 --- a/.github/workflows/e2e_tests.yaml +++ b/.github/workflows/e2e_tests.yaml @@ -56,7 +56,7 @@ jobs: - name: Set up specific Python version uses: actions/setup-python@v5 with: - python-version: '3.9' + python-version: '3.11' cache: 'pip' # caching pip dependencies - name: Setup NVidia GPU environment for KinD diff --git a/.github/workflows/guided_notebook_tests.yaml b/.github/workflows/guided_notebook_tests.yaml index 7a77d5a3..71a82284 100644 --- a/.github/workflows/guided_notebook_tests.yaml +++ b/.github/workflows/guided_notebook_tests.yaml @@ -49,7 +49,7 @@ jobs: - name: Set up specific Python version uses: actions/setup-python@v5 with: - python-version: '3.9' + python-version: '3.11' cache: 'pip' # caching pip dependencies - name: Setup and start KinD cluster @@ -160,7 +160,7 @@ jobs: - name: Set up specific Python version uses: actions/setup-python@v5 with: - python-version: '3.9' + python-version: '3.11' cache: 'pip' # caching pip dependencies - name: Setup NVidia GPU environment for KinD @@ -282,7 +282,7 @@ jobs: - name: Set up specific Python version uses: actions/setup-python@v5 with: - python-version: '3.9' + python-version: '3.11' cache: 'pip' # caching pip dependencies - name: Setup NVidia GPU environment for KinD diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index ddc23b5a..50e3f9e1 100644 --- a/.github/workflows/release.yaml +++ 
b/.github/workflows/release.yaml @@ -17,7 +17,7 @@ on: default: 'project-codeflare' python_version: type: string - default: "3.8" + default: "3.11" required: true poetry_version: type: string diff --git a/.github/workflows/ui_notebooks_test.yaml b/.github/workflows/ui_notebooks_test.yaml index 28f7e06c..6d82c49e 100644 --- a/.github/workflows/ui_notebooks_test.yaml +++ b/.github/workflows/ui_notebooks_test.yaml @@ -49,7 +49,7 @@ jobs: - name: Set up specific Python version uses: actions/setup-python@v5 with: - python-version: "3.9" + python-version: "3.11" cache: "pip" # caching pip dependencies - name: Setup and start KinD cluster diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 51934628..cbe3603d 100755 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -10,16 +10,16 @@ jobs: unit-tests: runs-on: ubuntu-latest - container: - image: quay.io/project-codeflare/codeflare-sdk-precommit:v0.0.3 steps: - uses: actions/checkout@v4 + + - name: Set up python + uses: actions/setup-python@v5 + with: + python-version: '3.11' - name: Install poetry run: pip install poetry - - uses: actions/setup-python@v5 - with: - python-version: '3.8' - - name: Install dependencies + - name: Install dependencies with poetry run: | poetry config virtualenvs.create false poetry lock --no-update From 0c30442931e70b64e67754f0fd8c0da19977018b Mon Sep 17 00:00:00 2001 From: Ignas Baranauskas Date: Thu, 19 Dec 2024 17:45:14 +0000 Subject: [PATCH 03/17] update unit tests to work on both Python 3.9 and 3.11 --- src/codeflare_sdk/common/kueue/test_kueue.py | 34 +++++++++++-------- .../common/utils/unit_test_support.py | 28 +++++++++++++++ src/codeflare_sdk/ray/cluster/test_config.py | 19 ++++++----- tests/test_cluster_yamls/kueue/aw_kueue.yaml | 4 +-- .../kueue/ray_cluster_kueue.yaml | 4 +-- .../ray/default-appwrapper.yaml | 4 +-- .../ray/default-ray-cluster.yaml | 4 +-- 7 files changed, 67 insertions(+), 30 deletions(-) diff --git 
a/src/codeflare_sdk/common/kueue/test_kueue.py b/src/codeflare_sdk/common/kueue/test_kueue.py index 77095d4d..0093058c 100644 --- a/src/codeflare_sdk/common/kueue/test_kueue.py +++ b/src/codeflare_sdk/common/kueue/test_kueue.py @@ -11,7 +11,12 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from ..utils.unit_test_support import get_local_queue, createClusterConfig +from ..utils.unit_test_support import ( + apply_template, + get_local_queue, + createClusterConfig, + get_template_variables, +) from unittest.mock import patch from codeflare_sdk.ray.cluster.cluster import Cluster, ClusterConfiguration import yaml @@ -52,21 +57,21 @@ def test_cluster_creation_no_aw_local_queue(mocker): config.local_queue = "local-queue-default" cluster = Cluster(config) assert cluster.resource_yaml == f"{aw_dir}unit-test-cluster-kueue.yaml" - assert filecmp.cmp( - f"{aw_dir}unit-test-cluster-kueue.yaml", + expected_rc = apply_template( f"{parent}/tests/test_cluster_yamls/kueue/ray_cluster_kueue.yaml", - shallow=True, + get_template_variables(), ) + with open(f"{aw_dir}unit-test-cluster-kueue.yaml", "r") as f: + cluster_kueue = yaml.load(f, Loader=yaml.FullLoader) + assert cluster_kueue == expected_rc + # With resources loaded in memory, no Local Queue specified. 
config = createClusterConfig() config.name = "unit-test-cluster-kueue" config.write_to_file = False cluster = Cluster(config) - - with open(f"{parent}/tests/test_cluster_yamls/kueue/ray_cluster_kueue.yaml") as f: - expected_rc = yaml.load(f, Loader=yaml.FullLoader) - assert cluster.resource_yaml == expected_rc + assert cluster.resource_yaml == expected_rc def test_aw_creation_local_queue(mocker): @@ -86,12 +91,15 @@ def test_aw_creation_local_queue(mocker): config.local_queue = "local-queue-default" cluster = Cluster(config) assert cluster.resource_yaml == f"{aw_dir}unit-test-aw-kueue.yaml" - assert filecmp.cmp( - f"{aw_dir}unit-test-aw-kueue.yaml", + expected_rc = apply_template( f"{parent}/tests/test_cluster_yamls/kueue/aw_kueue.yaml", - shallow=True, + get_template_variables(), ) + with open(f"{aw_dir}unit-test-aw-kueue.yaml", "r") as f: + aw_kueue = yaml.load(f, Loader=yaml.FullLoader) + assert aw_kueue == expected_rc + # With resources loaded in memory, no Local Queue specified. config = createClusterConfig() config.name = "unit-test-aw-kueue" @@ -99,9 +107,7 @@ def test_aw_creation_local_queue(mocker): config.write_to_file = False cluster = Cluster(config) - with open(f"{parent}/tests/test_cluster_yamls/kueue/aw_kueue.yaml") as f: - expected_rc = yaml.load(f, Loader=yaml.FullLoader) - assert cluster.resource_yaml == expected_rc + assert cluster.resource_yaml == expected_rc def test_get_local_queue_exists_fail(mocker): diff --git a/src/codeflare_sdk/common/utils/unit_test_support.py b/src/codeflare_sdk/common/utils/unit_test_support.py index 8e034378..4182fc93 100644 --- a/src/codeflare_sdk/common/utils/unit_test_support.py +++ b/src/codeflare_sdk/common/utils/unit_test_support.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import string +import sys from codeflare_sdk.ray.cluster.cluster import ( Cluster, ClusterConfiguration, @@ -255,6 +257,32 @@ def arg_check_del_effect(group, version, namespace, plural, name, *args): assert name == "ray-dashboard-unit-test-cluster-ray" +def apply_template(yaml_file_path, variables): + with open(yaml_file_path, "r") as file: + yaml_content = file.read() + + # Create a Template instance and substitute the variables + template = string.Template(yaml_content) + filled_yaml = template.substitute(variables) + + # Now load the filled YAML into a Python object + return yaml.load(filled_yaml, Loader=yaml.FullLoader) + + +def get_expected_image(): + python_version = sys.version_info + if python_version.major == 3 and python_version.minor == 9: + return "quay.io/modh/ray@sha256:0d715f92570a2997381b7cafc0e224cfa25323f18b9545acfd23bc2b71576d06" + else: + return "quay.io/modh/ray@sha256:db667df1bc437a7b0965e8031e905d3ab04b86390d764d120e05ea5a5c18d1b4" + + +def get_template_variables(): + return { + "image": get_expected_image(), + } + + def arg_check_apply_effect(group, version, namespace, plural, body, *args): assert namespace == "ns" assert args == tuple() diff --git a/src/codeflare_sdk/ray/cluster/test_config.py b/src/codeflare_sdk/ray/cluster/test_config.py index 3416fc28..b16cbaf2 100644 --- a/src/codeflare_sdk/ray/cluster/test_config.py +++ b/src/codeflare_sdk/ray/cluster/test_config.py @@ -13,15 +13,15 @@ # limitations under the License. 
from codeflare_sdk.common.utils.unit_test_support import ( + apply_template, createClusterWrongType, - get_local_queue, create_cluster_all_config_params, + get_template_variables, ) from codeflare_sdk.ray.cluster.cluster import ClusterConfiguration, Cluster from pathlib import Path import filecmp import pytest -import yaml import os parent = Path(__file__).resolve().parents[4] # project directory @@ -36,9 +36,11 @@ def test_default_cluster_creation(mocker): cluster = Cluster(ClusterConfiguration(name="default-cluster", namespace="ns")) - with open(f"{expected_clusters_dir}/ray/default-ray-cluster.yaml") as f: - expected_rc = yaml.load(f, Loader=yaml.FullLoader) - assert cluster.resource_yaml == expected_rc + expected_rc = apply_template( + f"{expected_clusters_dir}/ray/default-ray-cluster.yaml", + get_template_variables(), + ) + assert cluster.resource_yaml == expected_rc def test_default_appwrapper_creation(mocker): @@ -50,9 +52,10 @@ def test_default_appwrapper_creation(mocker): ClusterConfiguration(name="default-appwrapper", namespace="ns", appwrapper=True) ) - with open(f"{expected_clusters_dir}/ray/default-appwrapper.yaml") as f: - expected_aw = yaml.load(f, Loader=yaml.FullLoader) - assert cluster.resource_yaml == expected_aw + expected_aw = apply_template( + f"{expected_clusters_dir}/ray/default-appwrapper.yaml", get_template_variables() + ) + assert cluster.resource_yaml == expected_aw def test_config_creation_all_parameters(mocker): diff --git a/tests/test_cluster_yamls/kueue/aw_kueue.yaml b/tests/test_cluster_yamls/kueue/aw_kueue.yaml index 402ffb6a..b5b5ae3f 100644 --- a/tests/test_cluster_yamls/kueue/aw_kueue.yaml +++ b/tests/test_cluster_yamls/kueue/aw_kueue.yaml @@ -38,7 +38,7 @@ spec: template: spec: containers: - - image: quay.io/modh/ray@sha256:0d715f92570a2997381b7cafc0e224cfa25323f18b9545acfd23bc2b71576d06 + - image: "${image}" imagePullPolicy: Always lifecycle: preStop: @@ -103,7 +103,7 @@ spec: template: spec: containers: - - image: 
quay.io/modh/ray@sha256:0d715f92570a2997381b7cafc0e224cfa25323f18b9545acfd23bc2b71576d06 + - image: "${image}" imagePullPolicy: Always lifecycle: preStop: diff --git a/tests/test_cluster_yamls/kueue/ray_cluster_kueue.yaml b/tests/test_cluster_yamls/kueue/ray_cluster_kueue.yaml index a5cb3616..ad179a0b 100644 --- a/tests/test_cluster_yamls/kueue/ray_cluster_kueue.yaml +++ b/tests/test_cluster_yamls/kueue/ray_cluster_kueue.yaml @@ -38,7 +38,7 @@ spec: template: spec: containers: - - image: quay.io/modh/ray@sha256:0d715f92570a2997381b7cafc0e224cfa25323f18b9545acfd23bc2b71576d06 + - image: "${image}" imagePullPolicy: Always lifecycle: preStop: @@ -103,7 +103,7 @@ spec: template: spec: containers: - - image: quay.io/modh/ray@sha256:0d715f92570a2997381b7cafc0e224cfa25323f18b9545acfd23bc2b71576d06 + - image: "${image}" imagePullPolicy: Always lifecycle: preStop: diff --git a/tests/test_cluster_yamls/ray/default-appwrapper.yaml b/tests/test_cluster_yamls/ray/default-appwrapper.yaml index 3e97474d..eadfeaa0 100644 --- a/tests/test_cluster_yamls/ray/default-appwrapper.yaml +++ b/tests/test_cluster_yamls/ray/default-appwrapper.yaml @@ -36,7 +36,7 @@ spec: template: spec: containers: - - image: quay.io/modh/ray@sha256:0d715f92570a2997381b7cafc0e224cfa25323f18b9545acfd23bc2b71576d06 + - image: "${image}" imagePullPolicy: Always lifecycle: preStop: @@ -101,7 +101,7 @@ spec: template: spec: containers: - - image: quay.io/modh/ray@sha256:0d715f92570a2997381b7cafc0e224cfa25323f18b9545acfd23bc2b71576d06 + - image: "${image}" imagePullPolicy: Always lifecycle: preStop: diff --git a/tests/test_cluster_yamls/ray/default-ray-cluster.yaml b/tests/test_cluster_yamls/ray/default-ray-cluster.yaml index 34de53d2..056479e6 100644 --- a/tests/test_cluster_yamls/ray/default-ray-cluster.yaml +++ b/tests/test_cluster_yamls/ray/default-ray-cluster.yaml @@ -28,7 +28,7 @@ spec: template: spec: containers: - - image: 
quay.io/modh/ray@sha256:0d715f92570a2997381b7cafc0e224cfa25323f18b9545acfd23bc2b71576d06 + - image: "${image}" imagePullPolicy: Always lifecycle: preStop: @@ -93,7 +93,7 @@ spec: template: spec: containers: - - image: quay.io/modh/ray@sha256:0d715f92570a2997381b7cafc0e224cfa25323f18b9545acfd23bc2b71576d06 + - image: "${image}" imagePullPolicy: Always lifecycle: preStop: From 36f578d006f45fd21d6c3fd5c536c49087be72f3 Mon Sep 17 00:00:00 2001 From: Ignas Baranauskas Date: Thu, 19 Dec 2024 19:21:02 +0000 Subject: [PATCH 04/17] update contributing guide and docs to recommend Python 3.11 --- CONTRIBUTING.md | 2 +- docs/sphinx/user-docs/e2e.rst | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 884632da..8a87bad4 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -6,7 +6,7 @@ Thank you for your interest in contributing to the CodeFlare SDK! ### Prerequisites -- Python 3.9 +- Python 3.11 - [Poetry](https://python-poetry.org/) ### Setting Up Your Development Environment diff --git a/docs/sphinx/user-docs/e2e.rst b/docs/sphinx/user-docs/e2e.rst index 846536f1..6f3d1462 100644 --- a/docs/sphinx/user-docs/e2e.rst +++ b/docs/sphinx/user-docs/e2e.rst @@ -4,7 +4,7 @@ Running e2e tests locally Pre-requisites ^^^^^^^^^^^^^^ -- We recommend using Python 3.9, along with Poetry. +- We recommend using Python 3.11, along with Poetry. 
On KinD clusters ---------------- From 7cb6631ffa77c5cdbb7a55eb6f3b0c309ae02aed Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 2 Jan 2025 11:21:43 +0000 Subject: [PATCH 05/17] Updated coverage.svg --- coverage.svg | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/coverage.svg b/coverage.svg index 59d64b37..c1490035 100644 --- a/coverage.svg +++ b/coverage.svg @@ -15,7 +15,7 @@ coverage coverage - 93% - 93% + 90% + 90% From 1225d3888f1090acb17270d92fd8118cca4ab63a Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 2 Jan 2025 12:11:44 +0000 Subject: [PATCH 06/17] build(deps): bump jinja2 from 3.1.4 to 3.1.5 Bumps [jinja2](https://github.com/pallets/jinja) from 3.1.4 to 3.1.5. - [Release notes](https://github.com/pallets/jinja/releases) - [Changelog](https://github.com/pallets/jinja/blob/main/CHANGES.rst) - [Commits](https://github.com/pallets/jinja/compare/3.1.4...3.1.5) --- updated-dependencies: - dependency-name: jinja2 dependency-type: indirect ... Signed-off-by: dependabot[bot] --- poetry.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/poetry.lock b/poetry.lock index 2c9b713e..88224764 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1457,13 +1457,13 @@ testing = ["Django", "attrs", "colorama", "docopt", "pytest (<7.0.0)"] [[package]] name = "jinja2" -version = "3.1.4" +version = "3.1.5" description = "A very fast and expressive template engine." 
optional = false python-versions = ">=3.7" files = [ - {file = "jinja2-3.1.4-py3-none-any.whl", hash = "sha256:bc5dd2abb727a5319567b7a813e6a2e7318c39f4f487cfe6c89c6f9c7d25197d"}, - {file = "jinja2-3.1.4.tar.gz", hash = "sha256:4a3aee7acbbe7303aede8e9648d13b8bf88a429282aa6122a993f0ac800cb369"}, + {file = "jinja2-3.1.5-py3-none-any.whl", hash = "sha256:aba0f4dc9ed8013c424088f68a5c226f7d6097ed89b246d7749c2ec4175c6adb"}, + {file = "jinja2-3.1.5.tar.gz", hash = "sha256:8fefff8dc3034e27bb80d67c671eb8a9bc424c0ef4c0826edbff304cceff43bb"}, ] [package.dependencies] From 28ef1380d75feede4e19030ebe6145c32908be7b Mon Sep 17 00:00:00 2001 From: Karel Suta Date: Thu, 2 Jan 2025 13:42:50 +0100 Subject: [PATCH 07/17] Rename GPU GitHub runner to avoid version confusion --- .github/workflows/guided_notebook_tests.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/guided_notebook_tests.yaml b/.github/workflows/guided_notebook_tests.yaml index 71a82284..dc44a4bf 100644 --- a/.github/workflows/guided_notebook_tests.yaml +++ b/.github/workflows/guided_notebook_tests.yaml @@ -125,7 +125,7 @@ jobs: verify-1_cluster_job_client: if: ${{ contains(github.event.pull_request.labels.*.name, 'test-guided-notebooks') }} - runs-on: ubuntu-20.04-4core-gpu + runs-on: gpu-t4-4-core steps: - name: Checkout code @@ -247,7 +247,7 @@ jobs: verify-2_basic_interactive: if: ${{ contains(github.event.pull_request.labels.*.name, 'test-guided-notebooks') }} - runs-on: ubuntu-20.04-4core-gpu + runs-on: gpu-t4-4-core steps: - name: Checkout code From 6936c2ccfbd7e3fd1aa300e834c89307112101dc Mon Sep 17 00:00:00 2001 From: ChristianZaccaria Date: Thu, 2 Jan 2025 12:55:20 +0000 Subject: [PATCH 08/17] Run workflow on labeling a pull request --- .github/workflows/guided_notebook_tests.yaml | 1 + .github/workflows/ui_notebooks_test.yaml | 1 + 2 files changed, 2 insertions(+) diff --git a/.github/workflows/guided_notebook_tests.yaml b/.github/workflows/guided_notebook_tests.yaml 
index dc44a4bf..5c241615 100644 --- a/.github/workflows/guided_notebook_tests.yaml +++ b/.github/workflows/guided_notebook_tests.yaml @@ -3,6 +3,7 @@ name: Guided notebooks tests on: pull_request: branches: [ main ] + types: [ labeled ] concurrency: group: ${{ github.head_ref }}-${{ github.workflow }} diff --git a/.github/workflows/ui_notebooks_test.yaml b/.github/workflows/ui_notebooks_test.yaml index 6d82c49e..d18d0e61 100644 --- a/.github/workflows/ui_notebooks_test.yaml +++ b/.github/workflows/ui_notebooks_test.yaml @@ -3,6 +3,7 @@ name: UI notebooks tests on: pull_request: branches: [ main ] + types: [ labeled ] concurrency: group: ${{ github.head_ref }}-${{ github.workflow }} From 77e79524930296e007e7e11e10114331046d6eff Mon Sep 17 00:00:00 2001 From: Bobbins228 Date: Thu, 19 Dec 2024 12:03:44 +0000 Subject: [PATCH 09/17] feat: add annotations parameter to cluster configuration --- src/codeflare_sdk/ray/cluster/build_ray_cluster.py | 7 +++---- src/codeflare_sdk/ray/cluster/config.py | 3 +++ 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/codeflare_sdk/ray/cluster/build_ray_cluster.py b/src/codeflare_sdk/ray/cluster/build_ray_cluster.py index e590d483..11171338 100644 --- a/src/codeflare_sdk/ray/cluster/build_ray_cluster.py +++ b/src/codeflare_sdk/ray/cluster/build_ray_cluster.py @@ -191,7 +191,7 @@ def get_metadata(cluster: "codeflare_sdk.ray.cluster.Cluster"): ) # Get the NB annotation if it exists - could be useful in future for a "annotations" parameter. - annotations = get_nb_annotations() + annotations = with_nb_annotations(cluster.config.annotations) if annotations != {}: object_meta.annotations = annotations # As annotations are not a guarantee they are appended to the metadata after creation. 
return object_meta @@ -213,11 +213,10 @@ def get_labels(cluster: "codeflare_sdk.ray.cluster.Cluster"): return labels -def get_nb_annotations(): +def with_nb_annotations(annotations: dict): """ - The get_nb_annotations() function generates the annotation for NB Prefix if the SDK is running in a notebook + The with_nb_annotations() function generates the annotation for NB Prefix if the SDK is running in a notebook and appends any user set annotations """ - annotations = {} # Notebook annotation nb_prefix = os.environ.get("NB_PREFIX") diff --git a/src/codeflare_sdk/ray/cluster/config.py b/src/codeflare_sdk/ray/cluster/config.py index b8b097f8..e3069029 100644 --- a/src/codeflare_sdk/ray/cluster/config.py +++ b/src/codeflare_sdk/ray/cluster/config.py @@ -89,6 +89,8 @@ class ClusterConfiguration: A dictionary of custom resource mappings to map extended resource requests to RayCluster resource names overwrite_default_resource_mapping: A boolean indicating whether to overwrite the default resource mapping. + annotations: + A dictionary of annotations to apply to the cluster. 
""" name: str @@ -126,6 +128,7 @@ class ClusterConfiguration: extended_resource_mapping: Dict[str, str] = field(default_factory=dict) overwrite_default_resource_mapping: bool = False local_queue: Optional[str] = None + annotations: Dict[str, str] = field(default_factory=dict) def __post_init__(self): if not self.verify_tls: From 2fc94881c9970362bd3b12d968fac31ec5902729 Mon Sep 17 00:00:00 2001 From: Bobbins228 Date: Thu, 19 Dec 2024 12:05:16 +0000 Subject: [PATCH 10/17] test: add unit test for annotations parameter --- src/codeflare_sdk/common/utils/unit_test_support.py | 1 + src/codeflare_sdk/ray/cluster/test_config.py | 5 +++++ .../test_cluster_yamls/appwrapper/unit-test-all-params.yaml | 2 ++ tests/test_cluster_yamls/ray/unit-test-all-params.yaml | 2 ++ 4 files changed, 10 insertions(+) diff --git a/src/codeflare_sdk/common/utils/unit_test_support.py b/src/codeflare_sdk/common/utils/unit_test_support.py index 4182fc93..388b6b4e 100644 --- a/src/codeflare_sdk/common/utils/unit_test_support.py +++ b/src/codeflare_sdk/common/utils/unit_test_support.py @@ -442,5 +442,6 @@ def create_cluster_all_config_params(mocker, cluster_name, is_appwrapper) -> Clu extended_resource_mapping={"example.com/gpu": "GPU", "intel.com/gpu": "TPU"}, overwrite_default_resource_mapping=True, local_queue="local-queue-default", + annotations={"key1": "value1", "key2": "value2"}, ) return Cluster(config) diff --git a/src/codeflare_sdk/ray/cluster/test_config.py b/src/codeflare_sdk/ray/cluster/test_config.py index b16cbaf2..aafb23dd 100644 --- a/src/codeflare_sdk/ray/cluster/test_config.py +++ b/src/codeflare_sdk/ray/cluster/test_config.py @@ -93,6 +93,11 @@ def test_config_creation_all_parameters(mocker): ) assert cluster.config.overwrite_default_resource_mapping == True assert cluster.config.local_queue == "local-queue-default" + assert cluster.config.annotations == { + "app.kubernetes.io/managed-by": "test-prefix", + "key1": "value1", + "key2": "value2", + } assert filecmp.cmp( 
f"{aw_dir}test-all-params.yaml", diff --git a/tests/test_cluster_yamls/appwrapper/unit-test-all-params.yaml b/tests/test_cluster_yamls/appwrapper/unit-test-all-params.yaml index 6d2c5440..36d186d1 100644 --- a/tests/test_cluster_yamls/appwrapper/unit-test-all-params.yaml +++ b/tests/test_cluster_yamls/appwrapper/unit-test-all-params.yaml @@ -13,6 +13,8 @@ spec: metadata: annotations: app.kubernetes.io/managed-by: test-prefix + key1: value1 + key2: value2 labels: controller-tools.k8s.io: '1.0' key1: value1 diff --git a/tests/test_cluster_yamls/ray/unit-test-all-params.yaml b/tests/test_cluster_yamls/ray/unit-test-all-params.yaml index 8426eede..a6514bd8 100644 --- a/tests/test_cluster_yamls/ray/unit-test-all-params.yaml +++ b/tests/test_cluster_yamls/ray/unit-test-all-params.yaml @@ -3,6 +3,8 @@ kind: RayCluster metadata: annotations: app.kubernetes.io/managed-by: test-prefix + key1: value1 + key2: value2 labels: controller-tools.k8s.io: '1.0' key1: value1 From 22a24d8769421c1130740289a3998a880ce1f48b Mon Sep 17 00:00:00 2001 From: Bobbins228 Date: Thu, 19 Dec 2024 12:07:26 +0000 Subject: [PATCH 11/17] docs: update user docs with annotations parameter example --- docs/sphinx/user-docs/cluster-configuration.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/sphinx/user-docs/cluster-configuration.rst b/docs/sphinx/user-docs/cluster-configuration.rst index dc3f2cf4..411760f8 100644 --- a/docs/sphinx/user-docs/cluster-configuration.rst +++ b/docs/sphinx/user-docs/cluster-configuration.rst @@ -26,6 +26,7 @@ requirements for creating the Ray Cluster. worker_memory_limits=2, # Default 2 # image="", # Optional Field labels={"exampleLabel": "example", "secondLabel": "example"}, + annotations={"key1":"value1", "key2":"value2"}, )) .. 
note:: From 7fef14b83469fd9e65070aca705b29b094543e53 Mon Sep 17 00:00:00 2001 From: Karel Suta Date: Fri, 3 Jan 2025 12:12:33 +0100 Subject: [PATCH 12/17] Migrate GitHub runners to newest Ubuntu --- .github/workflows/additional_demo_notebook_tests.yaml | 4 ++-- .github/workflows/guided_notebook_tests.yaml | 2 +- .github/workflows/odh-notebooks-sync.yml | 2 +- .github/workflows/ui_notebooks_test.yaml | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/additional_demo_notebook_tests.yaml b/.github/workflows/additional_demo_notebook_tests.yaml index a081f9ee..096cb509 100644 --- a/.github/workflows/additional_demo_notebook_tests.yaml +++ b/.github/workflows/additional_demo_notebook_tests.yaml @@ -15,7 +15,7 @@ env: jobs: verify-local_interactive: if: ${{ github.event.label.name == 'test-additional-notebooks' }} - runs-on: ubuntu-20.04-4core + runs-on: ubuntu-latest-4core steps: - name: Checkout code @@ -133,7 +133,7 @@ jobs: verify-ray_job_client: if: ${{ github.event.label.name == 'test-additional-notebooks' }} - runs-on: ubuntu-20.04-4core + runs-on: ubuntu-latest-4core steps: - name: Checkout code diff --git a/.github/workflows/guided_notebook_tests.yaml b/.github/workflows/guided_notebook_tests.yaml index 5c241615..3309c6a1 100644 --- a/.github/workflows/guided_notebook_tests.yaml +++ b/.github/workflows/guided_notebook_tests.yaml @@ -15,7 +15,7 @@ env: jobs: verify-0_basic_ray: if: ${{ contains(github.event.pull_request.labels.*.name, 'test-guided-notebooks') }} - runs-on: ubuntu-20.04-4core + runs-on: ubuntu-latest-4core steps: - name: Checkout code diff --git a/.github/workflows/odh-notebooks-sync.yml b/.github/workflows/odh-notebooks-sync.yml index f0853bff..c2ae6d3a 100644 --- a/.github/workflows/odh-notebooks-sync.yml +++ b/.github/workflows/odh-notebooks-sync.yml @@ -37,7 +37,7 @@ env: jobs: build: - runs-on: ubuntu-22.04-8core + runs-on: ubuntu-latest-8core steps: - name: Clone repository and Sync run: | diff --git 
a/.github/workflows/ui_notebooks_test.yaml b/.github/workflows/ui_notebooks_test.yaml index d18d0e61..1b5ad524 100644 --- a/.github/workflows/ui_notebooks_test.yaml +++ b/.github/workflows/ui_notebooks_test.yaml @@ -15,7 +15,7 @@ env: jobs: verify-3_widget_example: if: ${{ contains(github.event.pull_request.labels.*.name, 'test-guided-notebooks') || contains(github.event.pull_request.labels.*.name, 'test-ui-notebooks') }} - runs-on: ubuntu-20.04-4core + runs-on: ubuntu-latest-4core steps: - name: Checkout code From 6b0a3ccee5acdaf2541ea8f034c58dbcab559a70 Mon Sep 17 00:00:00 2001 From: Ignas Baranauskas Date: Mon, 6 Jan 2025 10:05:49 +0000 Subject: [PATCH 13/17] fix: remove --no-update as it is the default in poetry 2.0 --- .github/workflows/coverage-badge.yaml | 2 +- .github/workflows/unit-tests.yml | 2 +- tests/e2e/install-codeflare-sdk.sh | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/coverage-badge.yaml b/.github/workflows/coverage-badge.yaml index e2fdddae..0d2d41f6 100644 --- a/.github/workflows/coverage-badge.yaml +++ b/.github/workflows/coverage-badge.yaml @@ -22,7 +22,7 @@ jobs: python -m pip install --upgrade pip pip install poetry poetry config virtualenvs.create false - poetry lock --no-update + poetry lock poetry install --with test - name: Generate coverage report run: | diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index cbe3603d..6697fc80 100755 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -22,7 +22,7 @@ jobs: - name: Install dependencies with poetry run: | poetry config virtualenvs.create false - poetry lock --no-update + poetry lock poetry install --with test - name: Test with pytest and check coverage run: | diff --git a/tests/e2e/install-codeflare-sdk.sh b/tests/e2e/install-codeflare-sdk.sh index e7808582..8ec5e1e6 100644 --- a/tests/e2e/install-codeflare-sdk.sh +++ b/tests/e2e/install-codeflare-sdk.sh @@ -9,7 +9,7 @@ poetry config 
virtualenvs.create false cd codeflare-sdk # Lock dependencies and install them -poetry lock --no-update +poetry lock poetry install --with test,docs # Return to the workdir From ca834c7f7dcf61d4a6acb495a1aea20128b2d5bd Mon Sep 17 00:00:00 2001 From: Bobbins228 Date: Thu, 21 Nov 2024 14:17:07 +0000 Subject: [PATCH 14/17] feat: add custom volumes/volume mounts for ray clusters --- .../ray/cluster/build_ray_cluster.py | 26 ++++++++++++++++--- src/codeflare_sdk/ray/cluster/config.py | 7 +++++ 2 files changed, 30 insertions(+), 3 deletions(-) diff --git a/src/codeflare_sdk/ray/cluster/build_ray_cluster.py b/src/codeflare_sdk/ray/cluster/build_ray_cluster.py index 11171338..a08f3f73 100644 --- a/src/codeflare_sdk/ray/cluster/build_ray_cluster.py +++ b/src/codeflare_sdk/ray/cluster/build_ray_cluster.py @@ -249,7 +249,7 @@ def get_pod_spec(cluster: "codeflare_sdk.ray.cluster.Cluster", containers): """ pod_spec = V1PodSpec( containers=containers, - volumes=VOLUMES, + volumes=generate_custom_storage(cluster.config.volumes, VOLUMES), ) if cluster.config.image_pull_secrets != []: pod_spec.image_pull_secrets = generate_image_pull_secrets(cluster) @@ -295,7 +295,9 @@ def get_head_container_spec( cluster.config.head_memory_limits, cluster.config.head_extended_resource_requests, ), - volume_mounts=VOLUME_MOUNTS, + volume_mounts=generate_custom_storage( + cluster.config.volume_mounts, VOLUME_MOUNTS + ), ) if cluster.config.envs != {}: head_container.env = generate_env_vars(cluster) @@ -337,7 +339,9 @@ def get_worker_container_spec( cluster.config.worker_memory_limits, cluster.config.worker_extended_resource_requests, ), - volume_mounts=VOLUME_MOUNTS, + volume_mounts=generate_custom_storage( + cluster.config.volume_mounts, VOLUME_MOUNTS + ), ) if cluster.config.envs != {}: @@ -521,6 +525,22 @@ def wrap_cluster( # Etc. 
+def generate_custom_storage(provided_storage: list, default_storage: list): + """ + The generate_custom_storage function updates the volumes/volume mounts configs with the default volumes/volume mounts. + """ + storage_list = provided_storage.copy() + + if storage_list == []: + storage_list = default_storage + else: + # We append the list of volumes/volume mounts with the defaults and return the full list + for storage in default_storage: + storage_list.append(storage) + + return storage_list + + def write_to_file(cluster: "codeflare_sdk.ray.cluster.Cluster", resource: dict): """ The write_to_file function writes the built Ray Cluster/AppWrapper dict as a yaml file in the .codeflare folder diff --git a/src/codeflare_sdk/ray/cluster/config.py b/src/codeflare_sdk/ray/cluster/config.py index e3069029..7a78e730 100644 --- a/src/codeflare_sdk/ray/cluster/config.py +++ b/src/codeflare_sdk/ray/cluster/config.py @@ -22,6 +22,7 @@ import warnings from dataclasses import dataclass, field, fields from typing import Dict, List, Optional, Union, get_args, get_origin +from kubernetes.client import V1Volume, V1VolumeMount dir = pathlib.Path(__file__).parent.parent.resolve() @@ -91,6 +92,10 @@ class ClusterConfiguration: A boolean indicating whether to overwrite the default resource mapping. annotations: A dictionary of annotations to apply to the cluster. 
+ volumes: + A list of V1Volume objects to add to the Cluster + volume_mounts: + A list of V1VolumeMount objects to add to the Cluster """ name: str @@ -129,6 +134,8 @@ class ClusterConfiguration: overwrite_default_resource_mapping: bool = False local_queue: Optional[str] = None annotations: Dict[str, str] = field(default_factory=dict) + volumes: list[V1Volume] = field(default_factory=list) + volume_mounts: list[V1VolumeMount] = field(default_factory=list) def __post_init__(self): if not self.verify_tls: From 7e5f896681611335c4e791d83474c4e8696ffca6 Mon Sep 17 00:00:00 2001 From: Bobbins228 Date: Thu, 21 Nov 2024 14:50:03 +0000 Subject: [PATCH 15/17] test: add volumes/volume mounts tests --- .../common/utils/unit_test_support.py | 42 +++++++++++++++++++ src/codeflare_sdk/ray/cluster/test_config.py | 4 ++ .../appwrapper/unit-test-all-params.yaml | 36 ++++++++++++++++ .../ray/unit-test-all-params.yaml | 36 ++++++++++++++++ 4 files changed, 118 insertions(+) diff --git a/src/codeflare_sdk/common/utils/unit_test_support.py b/src/codeflare_sdk/common/utils/unit_test_support.py index 388b6b4e..82f301a2 100644 --- a/src/codeflare_sdk/common/utils/unit_test_support.py +++ b/src/codeflare_sdk/common/utils/unit_test_support.py @@ -417,6 +417,7 @@ def create_cluster_all_config_params(mocker, cluster_name, is_appwrapper) -> Clu "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object", return_value=get_local_queue("kueue.x-k8s.io", "v1beta1", "ns", "localqueues"), ) + volumes, volume_mounts = get_example_extended_storage_opts() config = ClusterConfiguration( name=cluster_name, @@ -443,5 +444,46 @@ def create_cluster_all_config_params(mocker, cluster_name, is_appwrapper) -> Clu overwrite_default_resource_mapping=True, local_queue="local-queue-default", annotations={"key1": "value1", "key2": "value2"}, + volumes=volumes, + volume_mounts=volume_mounts, ) return Cluster(config) + + +def get_example_extended_storage_opts(): + from kubernetes.client import ( + V1Volume, + 
V1VolumeMount, + V1EmptyDirVolumeSource, + V1ConfigMapVolumeSource, + V1KeyToPath, + V1SecretVolumeSource, + ) + + volume_mounts = [ + V1VolumeMount(mount_path="/home/ray/test1", name="test"), + V1VolumeMount( + mount_path="/home/ray/test2", + name="test2", + ), + V1VolumeMount( + mount_path="/home/ray/test2", + name="test3", + ), + ] + + volumes = [ + V1Volume( + name="test", + empty_dir=V1EmptyDirVolumeSource(size_limit="500Gi"), + ), + V1Volume( + name="test2", + config_map=V1ConfigMapVolumeSource( + name="config-map-test", + items=[V1KeyToPath(key="test", path="/home/ray/test2/data.txt")], + ), + ), + V1Volume(name="test3", secret=V1SecretVolumeSource(secret_name="test-secret")), + ] + return volumes, volume_mounts diff --git a/src/codeflare_sdk/ray/cluster/test_config.py b/src/codeflare_sdk/ray/cluster/test_config.py index aafb23dd..5302e0eb 100644 --- a/src/codeflare_sdk/ray/cluster/test_config.py +++ b/src/codeflare_sdk/ray/cluster/test_config.py @@ -15,6 +15,7 @@ from codeflare_sdk.common.utils.unit_test_support import ( apply_template, createClusterWrongType, + get_example_extended_storage_opts, create_cluster_all_config_params, get_template_variables, ) @@ -64,6 +65,7 @@ def test_config_creation_all_parameters(mocker): expected_extended_resource_mapping = DEFAULT_RESOURCE_MAPPING expected_extended_resource_mapping.update({"example.com/gpu": "GPU"}) expected_extended_resource_mapping["intel.com/gpu"] = "TPU" + volumes, volume_mounts = get_example_extended_storage_opts() cluster = create_cluster_all_config_params(mocker, "test-all-params", False) assert cluster.config.name == "test-all-params" and cluster.config.namespace == "ns" @@ -98,6 +100,8 @@ def test_config_creation_all_parameters(mocker): "key1": "value1", "key2": "value2", } + assert cluster.config.volumes == volumes + assert cluster.config.volume_mounts == volume_mounts assert filecmp.cmp( f"{aw_dir}test-all-params.yaml", diff --git a/tests/test_cluster_yamls/appwrapper/unit-test-all-params.yaml 
b/tests/test_cluster_yamls/appwrapper/unit-test-all-params.yaml index 36d186d1..e0ecc75d 100644 --- a/tests/test_cluster_yamls/appwrapper/unit-test-all-params.yaml +++ b/tests/test_cluster_yamls/appwrapper/unit-test-all-params.yaml @@ -78,6 +78,12 @@ spec: memory: 12G nvidia.com/gpu: 1 volumeMounts: + - mountPath: /home/ray/test1 + name: test + - mountPath: /home/ray/test2 + name: test2 + - mountPath: /home/ray/test2 + name: test3 - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt name: odh-trusted-ca-cert subPath: odh-trusted-ca-bundle.crt @@ -94,6 +100,18 @@ spec: - name: secret1 - name: secret2 volumes: + - emptyDir: + sizeLimit: 500Gi + name: test + - configMap: + items: + - key: test + path: /home/ray/test2/data.txt + name: config-map-test + name: test2 + - name: test3 + secret: + secretName: test-secret - configMap: items: - key: ca-bundle.crt @@ -146,6 +164,12 @@ spec: memory: 12G nvidia.com/gpu: 1 volumeMounts: + - mountPath: /home/ray/test1 + name: test + - mountPath: /home/ray/test2 + name: test2 + - mountPath: /home/ray/test2 + name: test3 - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt name: odh-trusted-ca-cert subPath: odh-trusted-ca-bundle.crt @@ -162,6 +186,18 @@ spec: - name: secret1 - name: secret2 volumes: + - emptyDir: + sizeLimit: 500Gi + name: test + - configMap: + items: + - key: test + path: /home/ray/test2/data.txt + name: config-map-test + name: test2 + - name: test3 + secret: + secretName: test-secret - configMap: items: - key: ca-bundle.crt diff --git a/tests/test_cluster_yamls/ray/unit-test-all-params.yaml b/tests/test_cluster_yamls/ray/unit-test-all-params.yaml index a6514bd8..e743e9fe 100644 --- a/tests/test_cluster_yamls/ray/unit-test-all-params.yaml +++ b/tests/test_cluster_yamls/ray/unit-test-all-params.yaml @@ -69,6 +69,12 @@ spec: memory: 12G nvidia.com/gpu: 1 volumeMounts: + - mountPath: /home/ray/test1 + name: test + - mountPath: /home/ray/test2 + name: test2 + - mountPath: /home/ray/test2 + name: test3 - 
mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt name: odh-trusted-ca-cert subPath: odh-trusted-ca-bundle.crt @@ -85,6 +91,18 @@ spec: - name: secret1 - name: secret2 volumes: + - emptyDir: + sizeLimit: 500Gi + name: test + - configMap: + items: + - key: test + path: /home/ray/test2/data.txt + name: config-map-test + name: test2 + - name: test3 + secret: + secretName: test-secret - configMap: items: - key: ca-bundle.crt @@ -137,6 +155,12 @@ spec: memory: 12G nvidia.com/gpu: 1 volumeMounts: + - mountPath: /home/ray/test1 + name: test + - mountPath: /home/ray/test2 + name: test2 + - mountPath: /home/ray/test2 + name: test3 - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt name: odh-trusted-ca-cert subPath: odh-trusted-ca-bundle.crt @@ -153,6 +177,18 @@ spec: - name: secret1 - name: secret2 volumes: + - emptyDir: + sizeLimit: 500Gi + name: test + - configMap: + items: + - key: test + path: /home/ray/test2/data.txt + name: config-map-test + name: test2 + - name: test3 + secret: + secretName: test-secret - configMap: items: - key: ca-bundle.crt From 051ee3c470e95d87c88829b81166409a410ece09 Mon Sep 17 00:00:00 2001 From: Bobbins228 Date: Thu, 21 Nov 2024 15:20:54 +0000 Subject: [PATCH 16/17] docs: add user docs for custom volumes/volume mounts --- .../user-docs/cluster-configuration.rst | 49 +++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/docs/sphinx/user-docs/cluster-configuration.rst b/docs/sphinx/user-docs/cluster-configuration.rst index 411760f8..9f9fdddb 100644 --- a/docs/sphinx/user-docs/cluster-configuration.rst +++ b/docs/sphinx/user-docs/cluster-configuration.rst @@ -27,6 +27,8 @@ requirements for creating the Ray Cluster. # image="", # Optional Field labels={"exampleLabel": "example", "secondLabel": "example"}, annotations={"key1":"value1", "key2":"value2"}, + volumes=[], # See Custom Volumes/Volume Mounts + volume_mounts=[], # See Custom Volumes/Volume Mounts )) .. 
note:: @@ -49,6 +51,53 @@ apply additional labels to the RayCluster resource. After creating their ``cluster``, a user can call ``cluster.up()`` and ``cluster.down()`` to respectively create or remove the Ray Cluster. +Custom Volumes/Volume Mounts +---------------------------- +| To add custom Volumes and Volume Mounts to your Ray Cluster you need to create two lists ``volumes`` and ``volume_mounts``. The lists consist of ``V1Volume`` and ``V1VolumeMount`` objects respectively. +| Populating these parameters will create Volumes and Volume Mounts for the head and each worker pod. + +.. code:: python + + from kubernetes.client import V1Volume, V1VolumeMount, V1EmptyDirVolumeSource, V1ConfigMapVolumeSource, V1KeyToPath, V1SecretVolumeSource + # In this example we are using the Config Map, EmptyDir and Secret Volume types + volume_mounts_list = [ + V1VolumeMount( + mount_path="/home/ray/test1", + name = "test" + ), + V1VolumeMount( + mount_path = "/home/ray/test2", + name = "test2", + ), + V1VolumeMount( + mount_path = "/home/ray/test3", + name = "test3", + ) + ] + + volumes_list = [ + V1Volume( + name="test", + empty_dir=V1EmptyDirVolumeSource(size_limit="2Gi"), + ), + V1Volume( + name="test2", + config_map=V1ConfigMapVolumeSource( + name="test-config-map", + items=[V1KeyToPath(key="test", path="data.txt")] + ) + ), + V1Volume( + name="test3", + secret=V1SecretVolumeSource( + secret_name="test-secret" + ) + ) + ] + +| For more information on creating Volumes and Volume Mounts with Python check out the Python Kubernetes docs (`Volumes <https://github.com/kubernetes-client/python/blob/master/kubernetes/docs/V1Volume.md>`__, `Volume Mounts <https://github.com/kubernetes-client/python/blob/master/kubernetes/docs/V1VolumeMount.md>`__). +| You can also find further information on Volumes and Volume Mounts by visiting the Kubernetes `documentation <https://kubernetes.io/docs/concepts/storage/volumes/>`__.
+ Deprecating Parameters ---------------------- From d89ee5a5320056c131da258e3c61d6de71bb9549 Mon Sep 17 00:00:00 2001 From: Jiri Petrlik Date: Mon, 6 Jan 2025 14:13:45 +0100 Subject: [PATCH 17/17] RHOAIENG-8098 - ClusterConfiguration should support tolerations --- .../common/utils/unit_test_support.py | 11 +++++++++ .../ray/cluster/build_ray_cluster.py | 24 +++++++++++++++---- src/codeflare_sdk/ray/cluster/config.py | 13 ++++++++-- .../appwrapper/unit-test-all-params.yaml | 10 ++++++++ .../ray/unit-test-all-params.yaml | 10 ++++++++ 5 files changed, 62 insertions(+), 6 deletions(-) diff --git a/src/codeflare_sdk/common/utils/unit_test_support.py b/src/codeflare_sdk/common/utils/unit_test_support.py index 82f301a2..373283b8 100644 --- a/src/codeflare_sdk/common/utils/unit_test_support.py +++ b/src/codeflare_sdk/common/utils/unit_test_support.py @@ -22,6 +22,7 @@ import yaml from pathlib import Path from kubernetes import client +from kubernetes.client import V1Toleration from unittest.mock import patch parent = Path(__file__).resolve().parents[4] # project directory @@ -427,8 +428,18 @@ def create_cluster_all_config_params(mocker, cluster_name, is_appwrapper) -> Clu head_memory_requests=12, head_memory_limits=16, head_extended_resource_requests={"nvidia.com/gpu": 1, "intel.com/gpu": 2}, + head_tolerations=[ + V1Toleration( + key="key1", operator="Equal", value="value1", effect="NoSchedule" + ) + ], worker_cpu_requests=4, worker_cpu_limits=8, + worker_tolerations=[ + V1Toleration( + key="key2", operator="Equal", value="value2", effect="NoSchedule" + ) + ], num_workers=10, worker_memory_requests=12, worker_memory_limits=16, diff --git a/src/codeflare_sdk/ray/cluster/build_ray_cluster.py b/src/codeflare_sdk/ray/cluster/build_ray_cluster.py index a08f3f73..215ac32e 100644 --- a/src/codeflare_sdk/ray/cluster/build_ray_cluster.py +++ b/src/codeflare_sdk/ray/cluster/build_ray_cluster.py @@ -16,7 +16,7 @@ This sub-module exists primarily to be used internally by the 
Cluster object (in the cluster sub-module) for RayCluster/AppWrapper generation. """ -from typing import Union, Tuple, Dict +from typing import List, Union, Tuple, Dict from ...common import _kube_api_error_handling from ...common.kubernetes_cluster import get_api_client, config_check from kubernetes.client.exceptions import ApiException @@ -40,6 +40,7 @@ V1PodTemplateSpec, V1PodSpec, V1LocalObjectReference, + V1Toleration, ) import yaml @@ -139,7 +140,11 @@ def build_ray_cluster(cluster: "codeflare_sdk.ray.cluster.Cluster"): "resources": head_resources, }, "template": { - "spec": get_pod_spec(cluster, [get_head_container_spec(cluster)]) + "spec": get_pod_spec( + cluster, + [get_head_container_spec(cluster)], + cluster.config.head_tolerations, + ) }, }, "workerGroupSpecs": [ @@ -154,7 +159,11 @@ def build_ray_cluster(cluster: "codeflare_sdk.ray.cluster.Cluster"): "resources": worker_resources, }, "template": V1PodTemplateSpec( - spec=get_pod_spec(cluster, [get_worker_container_spec(cluster)]) + spec=get_pod_spec( + cluster, + [get_worker_container_spec(cluster)], + cluster.config.worker_tolerations, + ) ), } ], @@ -243,14 +252,21 @@ def update_image(image) -> str: return image -def get_pod_spec(cluster: "codeflare_sdk.ray.cluster.Cluster", containers): +def get_pod_spec( + cluster: "codeflare_sdk.ray.cluster.Cluster", + containers: List, + tolerations: List[V1Toleration], +) -> V1PodSpec: """ The get_pod_spec() function generates a V1PodSpec for the head/worker containers """ + pod_spec = V1PodSpec( containers=containers, volumes=generate_custom_storage(cluster.config.volumes, VOLUMES), + tolerations=tolerations or None, ) + if cluster.config.image_pull_secrets != []: pod_spec.image_pull_secrets = generate_image_pull_secrets(cluster) diff --git a/src/codeflare_sdk/ray/cluster/config.py b/src/codeflare_sdk/ray/cluster/config.py index 7a78e730..ab64be83 100644 --- a/src/codeflare_sdk/ray/cluster/config.py +++ b/src/codeflare_sdk/ray/cluster/config.py @@ -22,7 +22,7 
@@ import warnings from dataclasses import dataclass, field, fields from typing import Dict, List, Optional, Union, get_args, get_origin -from kubernetes.client import V1Volume, V1VolumeMount +from kubernetes.client import V1Toleration, V1Volume, V1VolumeMount dir = pathlib.Path(__file__).parent.parent.resolve() @@ -58,6 +58,8 @@ class ClusterConfiguration: The number of GPUs to allocate to the head node. (Deprecated, use head_extended_resource_requests) head_extended_resource_requests: A dictionary of extended resource requests for the head node. ex: {"nvidia.com/gpu": 1} + head_tolerations: + List of tolerations for head nodes. min_cpus: The minimum number of CPUs to allocate to each worker. max_cpus: @@ -70,6 +72,8 @@ class ClusterConfiguration: The maximum amount of memory to allocate to each worker. num_gpus: The number of GPUs to allocate to each worker. (Deprecated, use worker_extended_resource_requests) + worker_tolerations: + List of tolerations for worker nodes. appwrapper: A boolean indicating whether to use an AppWrapper. 
envs: @@ -110,6 +114,7 @@ class ClusterConfiguration: head_extended_resource_requests: Dict[str, Union[str, int]] = field( default_factory=dict ) + head_tolerations: Optional[List[V1Toleration]] = None worker_cpu_requests: Union[int, str] = 1 worker_cpu_limits: Union[int, str] = 1 min_cpus: Optional[Union[int, str]] = None # Deprecating @@ -120,6 +125,7 @@ class ClusterConfiguration: min_memory: Optional[Union[int, str]] = None # Deprecating max_memory: Optional[Union[int, str]] = None # Deprecating num_gpus: Optional[int] = None # Deprecating + worker_tolerations: Optional[List[V1Toleration]] = None appwrapper: bool = False envs: Dict[str, str] = field(default_factory=dict) image: str = "" @@ -272,7 +278,10 @@ def check_type(value, expected_type): if origin_type is Union: return any(check_type(value, union_type) for union_type in args) if origin_type is list: - return all(check_type(elem, args[0]) for elem in value) + if value is not None: + return all(check_type(elem, args[0]) for elem in (value or [])) + else: + return True if origin_type is dict: return all( check_type(k, args[0]) and check_type(v, args[1]) diff --git a/tests/test_cluster_yamls/appwrapper/unit-test-all-params.yaml b/tests/test_cluster_yamls/appwrapper/unit-test-all-params.yaml index e0ecc75d..0977d659 100644 --- a/tests/test_cluster_yamls/appwrapper/unit-test-all-params.yaml +++ b/tests/test_cluster_yamls/appwrapper/unit-test-all-params.yaml @@ -99,6 +99,11 @@ spec: imagePullSecrets: - name: secret1 - name: secret2 + tolerations: + - effect: NoSchedule + key: key1 + operator: Equal + value: value1 volumes: - emptyDir: sizeLimit: 500Gi @@ -185,6 +190,11 @@ spec: imagePullSecrets: - name: secret1 - name: secret2 + tolerations: + - effect: NoSchedule + key: key2 + operator: Equal + value: value2 volumes: - emptyDir: sizeLimit: 500Gi diff --git a/tests/test_cluster_yamls/ray/unit-test-all-params.yaml b/tests/test_cluster_yamls/ray/unit-test-all-params.yaml index e743e9fe..188319ab 100644 --- 
a/tests/test_cluster_yamls/ray/unit-test-all-params.yaml +++ b/tests/test_cluster_yamls/ray/unit-test-all-params.yaml @@ -90,6 +90,11 @@ spec: imagePullSecrets: - name: secret1 - name: secret2 + tolerations: + - effect: NoSchedule + key: key1 + operator: Equal + value: value1 volumes: - emptyDir: sizeLimit: 500Gi @@ -176,6 +181,11 @@ spec: imagePullSecrets: - name: secret1 - name: secret2 + tolerations: + - effect: NoSchedule + key: key2 + operator: Equal + value: value2 volumes: - emptyDir: sizeLimit: 500Gi