diff --git a/.github/workflows/additional_demo_notebook_tests.yaml b/.github/workflows/additional_demo_notebook_tests.yaml index 03201e10..096cb509 100644 --- a/.github/workflows/additional_demo_notebook_tests.yaml +++ b/.github/workflows/additional_demo_notebook_tests.yaml @@ -15,7 +15,7 @@ env: jobs: verify-local_interactive: if: ${{ github.event.label.name == 'test-additional-notebooks' }} - runs-on: ubuntu-20.04-4core + runs-on: ubuntu-latest-4core steps: - name: Checkout code @@ -50,7 +50,7 @@ jobs: - name: Set up specific Python version uses: actions/setup-python@v5 with: - python-version: '3.9' + python-version: '3.11' cache: 'pip' # caching pip dependencies - name: Setup and start KinD cluster @@ -133,7 +133,7 @@ jobs: verify-ray_job_client: if: ${{ github.event.label.name == 'test-additional-notebooks' }} - runs-on: ubuntu-20.04-4core + runs-on: ubuntu-latest-4core steps: - name: Checkout code @@ -168,7 +168,7 @@ jobs: - name: Set up specific Python version uses: actions/setup-python@v5 with: - python-version: '3.9' + python-version: '3.11' cache: 'pip' # caching pip dependencies - name: Setup and start KinD cluster diff --git a/.github/workflows/coverage-badge.yaml b/.github/workflows/coverage-badge.yaml index af273ce9..0d2d41f6 100644 --- a/.github/workflows/coverage-badge.yaml +++ b/.github/workflows/coverage-badge.yaml @@ -13,16 +13,16 @@ jobs: steps: - uses: actions/checkout@v4 - - name: Set up Python 3.9 + - name: Set up Python 3.11 uses: actions/setup-python@v5 with: - python-version: 3.9 + python-version: '3.11' - name: Install dependencies run: | python -m pip install --upgrade pip pip install poetry poetry config virtualenvs.create false - poetry lock --no-update + poetry lock poetry install --with test - name: Generate coverage report run: | diff --git a/.github/workflows/e2e_tests.yaml b/.github/workflows/e2e_tests.yaml index fea42ab6..fca6d6e7 100644 --- a/.github/workflows/e2e_tests.yaml +++ b/.github/workflows/e2e_tests.yaml @@ -21,7 +21,7 @@ 
env: jobs: kubernetes: - runs-on: ubuntu-20.04-4core-gpu + runs-on: gpu-t4-4-core steps: - name: Checkout code @@ -56,7 +56,7 @@ jobs: - name: Set up specific Python version uses: actions/setup-python@v5 with: - python-version: '3.9' + python-version: '3.11' cache: 'pip' # caching pip dependencies - name: Setup NVidia GPU environment for KinD diff --git a/.github/workflows/guided_notebook_tests.yaml b/.github/workflows/guided_notebook_tests.yaml index 7a77d5a3..3309c6a1 100644 --- a/.github/workflows/guided_notebook_tests.yaml +++ b/.github/workflows/guided_notebook_tests.yaml @@ -3,6 +3,7 @@ name: Guided notebooks tests on: pull_request: branches: [ main ] + types: [ labeled ] concurrency: group: ${{ github.head_ref }}-${{ github.workflow }} @@ -14,7 +15,7 @@ env: jobs: verify-0_basic_ray: if: ${{ contains(github.event.pull_request.labels.*.name, 'test-guided-notebooks') }} - runs-on: ubuntu-20.04-4core + runs-on: ubuntu-latest-4core steps: - name: Checkout code @@ -49,7 +50,7 @@ jobs: - name: Set up specific Python version uses: actions/setup-python@v5 with: - python-version: '3.9' + python-version: '3.11' cache: 'pip' # caching pip dependencies - name: Setup and start KinD cluster @@ -125,7 +126,7 @@ jobs: verify-1_cluster_job_client: if: ${{ contains(github.event.pull_request.labels.*.name, 'test-guided-notebooks') }} - runs-on: ubuntu-20.04-4core-gpu + runs-on: gpu-t4-4-core steps: - name: Checkout code @@ -160,7 +161,7 @@ jobs: - name: Set up specific Python version uses: actions/setup-python@v5 with: - python-version: '3.9' + python-version: '3.11' cache: 'pip' # caching pip dependencies - name: Setup NVidia GPU environment for KinD @@ -247,7 +248,7 @@ jobs: verify-2_basic_interactive: if: ${{ contains(github.event.pull_request.labels.*.name, 'test-guided-notebooks') }} - runs-on: ubuntu-20.04-4core-gpu + runs-on: gpu-t4-4-core steps: - name: Checkout code @@ -282,7 +283,7 @@ jobs: - name: Set up specific Python version uses: actions/setup-python@v5 with: - 
python-version: '3.9' + python-version: '3.11' cache: 'pip' # caching pip dependencies - name: Setup NVidia GPU environment for KinD diff --git a/.github/workflows/odh-notebooks-sync.yml b/.github/workflows/odh-notebooks-sync.yml index f0853bff..c2ae6d3a 100644 --- a/.github/workflows/odh-notebooks-sync.yml +++ b/.github/workflows/odh-notebooks-sync.yml @@ -37,7 +37,7 @@ env: jobs: build: - runs-on: ubuntu-22.04-8core + runs-on: ubuntu-latest-8core steps: - name: Clone repository and Sync run: | diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index ddc23b5a..50e3f9e1 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -17,7 +17,7 @@ on: default: 'project-codeflare' python_version: type: string - default: "3.8" + default: "3.11" required: true poetry_version: type: string diff --git a/.github/workflows/ui_notebooks_test.yaml b/.github/workflows/ui_notebooks_test.yaml index 28f7e06c..1b5ad524 100644 --- a/.github/workflows/ui_notebooks_test.yaml +++ b/.github/workflows/ui_notebooks_test.yaml @@ -3,6 +3,7 @@ name: UI notebooks tests on: pull_request: branches: [ main ] + types: [ labeled ] concurrency: group: ${{ github.head_ref }}-${{ github.workflow }} @@ -14,7 +15,7 @@ env: jobs: verify-3_widget_example: if: ${{ contains(github.event.pull_request.labels.*.name, 'test-guided-notebooks') || contains(github.event.pull_request.labels.*.name, 'test-ui-notebooks') }} - runs-on: ubuntu-20.04-4core + runs-on: ubuntu-latest-4core steps: - name: Checkout code @@ -49,7 +50,7 @@ jobs: - name: Set up specific Python version uses: actions/setup-python@v5 with: - python-version: "3.9" + python-version: "3.11" cache: "pip" # caching pip dependencies - name: Setup and start KinD cluster diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 51934628..6697fc80 100755 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -10,19 +10,19 @@ jobs: unit-tests: runs-on: 
ubuntu-latest - container: - image: quay.io/project-codeflare/codeflare-sdk-precommit:v0.0.3 steps: - uses: actions/checkout@v4 + + - name: Set up python + uses: actions/setup-python@v5 + with: + python-version: '3.11' - name: Install poetry run: pip install poetry - - uses: actions/setup-python@v5 - with: - python-version: '3.8' - - name: Install dependencies + - name: Install dependencies with poetry run: | poetry config virtualenvs.create false - poetry lock --no-update + poetry lock poetry install --with test - name: Test with pytest and check coverage run: | diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 884632da..8a87bad4 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -6,7 +6,7 @@ Thank you for your interest in contributing to the CodeFlare SDK! ### Prerequisites -- Python 3.9 +- Python 3.11 - [Poetry](https://python-poetry.org/) ### Setting Up Your Development Environment diff --git a/coverage.svg b/coverage.svg index 59d64b37..c1490035 100644 --- a/coverage.svg +++ b/coverage.svg @@ -15,7 +15,7 @@ coverage coverage - 93% - 93% + 90% + 90% diff --git a/docs/sphinx/user-docs/cluster-configuration.rst b/docs/sphinx/user-docs/cluster-configuration.rst index dc3f2cf4..9f9fdddb 100644 --- a/docs/sphinx/user-docs/cluster-configuration.rst +++ b/docs/sphinx/user-docs/cluster-configuration.rst @@ -26,6 +26,9 @@ requirements for creating the Ray Cluster. worker_memory_limits=2, # Default 2 # image="", # Optional Field labels={"exampleLabel": "example", "secondLabel": "example"}, + annotations={"key1":"value1", "key2":"value2"}, + volumes=[], # See Custom Volumes/Volume Mounts + volume_mounts=[], # See Custom Volumes/Volume Mounts )) .. note:: @@ -48,6 +51,53 @@ apply additional labels to the RayCluster resource. After creating their ``cluster``, a user can call ``cluster.up()`` and ``cluster.down()`` to respectively create or remove the Ray Cluster. 
+Custom Volumes/Volume Mounts +---------------------------- +| To add custom Volumes and Volume Mounts to your Ray Cluster you need to create two lists, ``volumes`` and ``volume_mounts``. The lists consist of ``V1Volume`` and ``V1VolumeMount`` objects respectively. +| Populating these parameters will create Volumes and Volume Mounts for the head and each worker pod. + +.. code:: python + + from kubernetes.client import V1Volume, V1VolumeMount, V1EmptyDirVolumeSource, V1ConfigMapVolumeSource, V1KeyToPath, V1SecretVolumeSource + # In this example we are using the Config Map, EmptyDir and Secret Volume types + volume_mounts_list = [ + V1VolumeMount( + mount_path="/home/ray/test1", + name = "test" + ), + V1VolumeMount( + mount_path = "/home/ray/test2", + name = "test2", + ), + V1VolumeMount( + mount_path = "/home/ray/test3", + name = "test3", + ) + ] + + volumes_list = [ + V1Volume( + name="test", + empty_dir=V1EmptyDirVolumeSource(size_limit="2Gi"), + ), + V1Volume( + name="test2", + config_map=V1ConfigMapVolumeSource( + name="test-config-map", + items=[V1KeyToPath(key="test", path="data.txt")] + ) + ), + V1Volume( + name="test3", + secret=V1SecretVolumeSource( + secret_name="test-secret" + ) + ) + ] + +| For more information on creating Volumes and Volume Mounts with Python check out the Python Kubernetes docs (`Volumes <https://github.com/kubernetes-client/python/blob/master/kubernetes/docs/V1Volume.md>`__, `Volume Mounts <https://github.com/kubernetes-client/python/blob/master/kubernetes/docs/V1VolumeMount.md>`__). +| You can also find further information on Volumes and Volume Mounts by visiting the Kubernetes `documentation <https://kubernetes.io/docs/concepts/storage/volumes/>`__. + Deprecating Parameters ---------------------- diff --git a/docs/sphinx/user-docs/e2e.rst b/docs/sphinx/user-docs/e2e.rst index 846536f1..6f3d1462 100644 --- a/docs/sphinx/user-docs/e2e.rst +++ b/docs/sphinx/user-docs/e2e.rst @@ -4,7 +4,7 @@ Running e2e tests locally Pre-requisites ^^^^^^^^^^^^^^ -- We recommend using Python 3.9, along with Poetry. +- We recommend using Python 3.11, along with Poetry. 
On KinD clusters ---------------- diff --git a/poetry.lock b/poetry.lock index 2c9b713e..88224764 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1457,13 +1457,13 @@ testing = ["Django", "attrs", "colorama", "docopt", "pytest (<7.0.0)"] [[package]] name = "jinja2" -version = "3.1.4" +version = "3.1.5" description = "A very fast and expressive template engine." optional = false python-versions = ">=3.7" files = [ - {file = "jinja2-3.1.4-py3-none-any.whl", hash = "sha256:bc5dd2abb727a5319567b7a813e6a2e7318c39f4f487cfe6c89c6f9c7d25197d"}, - {file = "jinja2-3.1.4.tar.gz", hash = "sha256:4a3aee7acbbe7303aede8e9648d13b8bf88a429282aa6122a993f0ac800cb369"}, + {file = "jinja2-3.1.5-py3-none-any.whl", hash = "sha256:aba0f4dc9ed8013c424088f68a5c226f7d6097ed89b246d7749c2ec4175c6adb"}, + {file = "jinja2-3.1.5.tar.gz", hash = "sha256:8fefff8dc3034e27bb80d67c671eb8a9bc424c0ef4c0826edbff304cceff43bb"}, ] [package.dependencies] diff --git a/src/codeflare_sdk/common/kueue/test_kueue.py b/src/codeflare_sdk/common/kueue/test_kueue.py index 77095d4d..0093058c 100644 --- a/src/codeflare_sdk/common/kueue/test_kueue.py +++ b/src/codeflare_sdk/common/kueue/test_kueue.py @@ -11,7 +11,12 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from ..utils.unit_test_support import get_local_queue, createClusterConfig +from ..utils.unit_test_support import ( + apply_template, + get_local_queue, + createClusterConfig, + get_template_variables, +) from unittest.mock import patch from codeflare_sdk.ray.cluster.cluster import Cluster, ClusterConfiguration import yaml @@ -52,21 +57,21 @@ def test_cluster_creation_no_aw_local_queue(mocker): config.local_queue = "local-queue-default" cluster = Cluster(config) assert cluster.resource_yaml == f"{aw_dir}unit-test-cluster-kueue.yaml" - assert filecmp.cmp( - f"{aw_dir}unit-test-cluster-kueue.yaml", + expected_rc = apply_template( f"{parent}/tests/test_cluster_yamls/kueue/ray_cluster_kueue.yaml", - shallow=True, + get_template_variables(), ) + with open(f"{aw_dir}unit-test-cluster-kueue.yaml", "r") as f: + cluster_kueue = yaml.load(f, Loader=yaml.FullLoader) + assert cluster_kueue == expected_rc + # With resources loaded in memory, no Local Queue specified. config = createClusterConfig() config.name = "unit-test-cluster-kueue" config.write_to_file = False cluster = Cluster(config) - - with open(f"{parent}/tests/test_cluster_yamls/kueue/ray_cluster_kueue.yaml") as f: - expected_rc = yaml.load(f, Loader=yaml.FullLoader) - assert cluster.resource_yaml == expected_rc + assert cluster.resource_yaml == expected_rc def test_aw_creation_local_queue(mocker): @@ -86,12 +91,15 @@ def test_aw_creation_local_queue(mocker): config.local_queue = "local-queue-default" cluster = Cluster(config) assert cluster.resource_yaml == f"{aw_dir}unit-test-aw-kueue.yaml" - assert filecmp.cmp( - f"{aw_dir}unit-test-aw-kueue.yaml", + expected_rc = apply_template( f"{parent}/tests/test_cluster_yamls/kueue/aw_kueue.yaml", - shallow=True, + get_template_variables(), ) + with open(f"{aw_dir}unit-test-aw-kueue.yaml", "r") as f: + aw_kueue = yaml.load(f, Loader=yaml.FullLoader) + assert aw_kueue == expected_rc + # With resources loaded in memory, no Local Queue specified. 
config = createClusterConfig() config.name = "unit-test-aw-kueue" @@ -99,9 +107,7 @@ def test_aw_creation_local_queue(mocker): config.write_to_file = False cluster = Cluster(config) - with open(f"{parent}/tests/test_cluster_yamls/kueue/aw_kueue.yaml") as f: - expected_rc = yaml.load(f, Loader=yaml.FullLoader) - assert cluster.resource_yaml == expected_rc + assert cluster.resource_yaml == expected_rc def test_get_local_queue_exists_fail(mocker): diff --git a/src/codeflare_sdk/common/utils/unit_test_support.py b/src/codeflare_sdk/common/utils/unit_test_support.py index 8e034378..373283b8 100644 --- a/src/codeflare_sdk/common/utils/unit_test_support.py +++ b/src/codeflare_sdk/common/utils/unit_test_support.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +import string +import sys from codeflare_sdk.ray.cluster.cluster import ( Cluster, ClusterConfiguration, @@ -20,6 +22,7 @@ import yaml from pathlib import Path from kubernetes import client +from kubernetes.client import V1Toleration from unittest.mock import patch parent = Path(__file__).resolve().parents[4] # project directory @@ -255,6 +258,32 @@ def arg_check_del_effect(group, version, namespace, plural, name, *args): assert name == "ray-dashboard-unit-test-cluster-ray" +def apply_template(yaml_file_path, variables): + with open(yaml_file_path, "r") as file: + yaml_content = file.read() + + # Create a Template instance and substitute the variables + template = string.Template(yaml_content) + filled_yaml = template.substitute(variables) + + # Now load the filled YAML into a Python object + return yaml.load(filled_yaml, Loader=yaml.FullLoader) + + +def get_expected_image(): + python_version = sys.version_info + if python_version.major == 3 and python_version.minor == 9: + return "quay.io/modh/ray@sha256:0d715f92570a2997381b7cafc0e224cfa25323f18b9545acfd23bc2b71576d06" + else: + return 
"quay.io/modh/ray@sha256:db667df1bc437a7b0965e8031e905d3ab04b86390d764d120e05ea5a5c18d1b4" + + +def get_template_variables(): + return { + "image": get_expected_image(), + } + + def arg_check_apply_effect(group, version, namespace, plural, body, *args): assert namespace == "ns" assert args == tuple() @@ -389,6 +418,7 @@ def create_cluster_all_config_params(mocker, cluster_name, is_appwrapper) -> Clu "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object", return_value=get_local_queue("kueue.x-k8s.io", "v1beta1", "ns", "localqueues"), ) + volumes, volume_mounts = get_example_extended_storage_opts() config = ClusterConfiguration( name=cluster_name, @@ -398,8 +428,18 @@ def create_cluster_all_config_params(mocker, cluster_name, is_appwrapper) -> Clu head_memory_requests=12, head_memory_limits=16, head_extended_resource_requests={"nvidia.com/gpu": 1, "intel.com/gpu": 2}, + head_tolerations=[ + V1Toleration( + key="key1", operator="Equal", value="value1", effect="NoSchedule" + ) + ], worker_cpu_requests=4, worker_cpu_limits=8, + worker_tolerations=[ + V1Toleration( + key="key2", operator="Equal", value="value2", effect="NoSchedule" + ) + ], num_workers=10, worker_memory_requests=12, worker_memory_limits=16, @@ -414,5 +454,47 @@ def create_cluster_all_config_params(mocker, cluster_name, is_appwrapper) -> Clu extended_resource_mapping={"example.com/gpu": "GPU", "intel.com/gpu": "TPU"}, overwrite_default_resource_mapping=True, local_queue="local-queue-default", + annotations={"key1": "value1", "key2": "value2"}, + volumes=volumes, + volume_mounts=volume_mounts, ) return Cluster(config) + + +def get_example_extended_storage_opts(): + from kubernetes.client import ( + V1Volume, + V1VolumeMount, + V1EmptyDirVolumeSource, + V1ConfigMapVolumeSource, + V1KeyToPath, + V1SecretVolumeSource, + ) + + volume_mounts = [ + V1VolumeMount(mount_path="/home/ray/test1", name="test"), + V1VolumeMount( + mount_path="/home/ray/test2", + name="test2", + ), + V1VolumeMount( + 
mount_path="/home/ray/test2", + name="test3", + ), + ] + + volumes = [ + V1Volume( + name="test", + empty_dir=V1EmptyDirVolumeSource(size_limit="500Gi"), + ), + V1Volume( + name="test2", + config_map=V1ConfigMapVolumeSource( + name="config-map-test", + items=[V1KeyToPath(key="test", path="/home/ray/test2/data.txt")], + ), + ), + V1Volume(name="test3", secret=V1SecretVolumeSource(secret_name="test-secret")), + ] + return volumes, volume_mounts diff --git a/src/codeflare_sdk/ray/cluster/build_ray_cluster.py b/src/codeflare_sdk/ray/cluster/build_ray_cluster.py index e590d483..215ac32e 100644 --- a/src/codeflare_sdk/ray/cluster/build_ray_cluster.py +++ b/src/codeflare_sdk/ray/cluster/build_ray_cluster.py @@ -16,7 +16,7 @@ This sub-module exists primarily to be used internally by the Cluster object (in the cluster sub-module) for RayCluster/AppWrapper generation. """ -from typing import Union, Tuple, Dict +from typing import List, Union, Tuple, Dict from ...common import _kube_api_error_handling from ...common.kubernetes_cluster import get_api_client, config_check from kubernetes.client.exceptions import ApiException @@ -40,6 +40,7 @@ V1PodTemplateSpec, V1PodSpec, V1LocalObjectReference, + V1Toleration, ) import yaml @@ -139,7 +140,11 @@ def build_ray_cluster(cluster: "codeflare_sdk.ray.cluster.Cluster"): "resources": head_resources, }, "template": { - "spec": get_pod_spec(cluster, [get_head_container_spec(cluster)]) + "spec": get_pod_spec( + cluster, + [get_head_container_spec(cluster)], + cluster.config.head_tolerations, + ) }, }, "workerGroupSpecs": [ @@ -154,7 +159,11 @@ def build_ray_cluster(cluster: "codeflare_sdk.ray.cluster.Cluster"): "resources": worker_resources, }, "template": V1PodTemplateSpec( - spec=get_pod_spec(cluster, [get_worker_container_spec(cluster)]) + spec=get_pod_spec( + cluster, + [get_worker_container_spec(cluster)], + cluster.config.worker_tolerations, + ) ), } ], @@ -191,7 +200,7 @@ def get_metadata(cluster: 
"codeflare_sdk.ray.cluster.Cluster"): ) # Get the NB annotation if it exists - could be useful in future for a "annotations" parameter. - annotations = get_nb_annotations() + annotations = with_nb_annotations(cluster.config.annotations) if annotations != {}: object_meta.annotations = annotations # As annotations are not a guarantee they are appended to the metadata after creation. return object_meta @@ -213,11 +222,10 @@ def get_labels(cluster: "codeflare_sdk.ray.cluster.Cluster"): return labels -def get_nb_annotations(): +def with_nb_annotations(annotations: dict): """ - The get_nb_annotations() function generates the annotation for NB Prefix if the SDK is running in a notebook + The with_nb_annotations() function generates the annotation for NB Prefix if the SDK is running in a notebook and appends any user set annotations """ - annotations = {} # Notebook annotation nb_prefix = os.environ.get("NB_PREFIX") @@ -244,14 +252,21 @@ def update_image(image) -> str: return image -def get_pod_spec(cluster: "codeflare_sdk.ray.cluster.Cluster", containers): +def get_pod_spec( + cluster: "codeflare_sdk.ray.cluster.Cluster", + containers: List, + tolerations: List[V1Toleration], +) -> V1PodSpec: """ The get_pod_spec() function generates a V1PodSpec for the head/worker containers """ + pod_spec = V1PodSpec( containers=containers, - volumes=VOLUMES, + volumes=generate_custom_storage(cluster.config.volumes, VOLUMES), + tolerations=tolerations or None, ) + if cluster.config.image_pull_secrets != []: pod_spec.image_pull_secrets = generate_image_pull_secrets(cluster) @@ -296,7 +311,9 @@ def get_head_container_spec( cluster.config.head_memory_limits, cluster.config.head_extended_resource_requests, ), - volume_mounts=VOLUME_MOUNTS, + volume_mounts=generate_custom_storage( + cluster.config.volume_mounts, VOLUME_MOUNTS + ), ) if cluster.config.envs != {}: head_container.env = generate_env_vars(cluster) @@ -338,7 +355,9 @@ def get_worker_container_spec( 
cluster.config.worker_memory_limits, cluster.config.worker_extended_resource_requests, ), - volume_mounts=VOLUME_MOUNTS, + volume_mounts=generate_custom_storage( + cluster.config.volume_mounts, VOLUME_MOUNTS + ), ) if cluster.config.envs != {}: @@ -522,6 +541,22 @@ def wrap_cluster( # Etc. +def generate_custom_storage(provided_storage: list, default_storage: list): + """ + The generate_custom_storage function returns a new list holding the user-provided volumes/volume mounts followed by the default volumes/volume mounts. Neither input list is modified, and a provided_storage of None is treated as an empty list. + """ + storage_list = list(provided_storage or []) + + if storage_list == []: + storage_list = list(default_storage) + else: + # We append the list of volumes/volume mounts with the defaults and return the full list + for storage in default_storage: + storage_list.append(storage) + + return storage_list + + def write_to_file(cluster: "codeflare_sdk.ray.cluster.Cluster", resource: dict): """ The write_to_file function writes the built Ray Cluster/AppWrapper dict as a yaml file in the .codeflare folder diff --git a/src/codeflare_sdk/ray/cluster/config.py b/src/codeflare_sdk/ray/cluster/config.py index b8b097f8..ab64be83 100644 --- a/src/codeflare_sdk/ray/cluster/config.py +++ b/src/codeflare_sdk/ray/cluster/config.py @@ -22,6 +22,7 @@ import warnings from dataclasses import dataclass, field, fields from typing import Dict, List, Optional, Union, get_args, get_origin +from kubernetes.client import V1Toleration, V1Volume, V1VolumeMount dir = pathlib.Path(__file__).parent.parent.resolve() @@ -57,6 +58,8 @@ class ClusterConfiguration: The number of GPUs to allocate to the head node. (Deprecated, use head_extended_resource_requests) head_extended_resource_requests: A dictionary of extended resource requests for the head node. ex: {"nvidia.com/gpu": 1} + head_tolerations: + List of tolerations for head nodes. min_cpus: The minimum number of CPUs to allocate to each worker. max_cpus: @@ -69,6 +72,8 @@ class ClusterConfiguration: The maximum amount of memory to allocate to each worker. 
num_gpus: The number of GPUs to allocate to each worker. (Deprecated, use worker_extended_resource_requests) + worker_tolerations: + List of tolerations for worker nodes. appwrapper: A boolean indicating whether to use an AppWrapper. envs: @@ -89,6 +94,12 @@ class ClusterConfiguration: A dictionary of custom resource mappings to map extended resource requests to RayCluster resource names overwrite_default_resource_mapping: A boolean indicating whether to overwrite the default resource mapping. + annotations: + A dictionary of annotations to apply to the cluster. + volumes: + A list of V1Volume objects to add to the Cluster + volume_mounts: + A list of V1VolumeMount objects to add to the Cluster """ name: str @@ -103,6 +114,7 @@ class ClusterConfiguration: head_extended_resource_requests: Dict[str, Union[str, int]] = field( default_factory=dict ) + head_tolerations: Optional[List[V1Toleration]] = None worker_cpu_requests: Union[int, str] = 1 worker_cpu_limits: Union[int, str] = 1 min_cpus: Optional[Union[int, str]] = None # Deprecating @@ -113,6 +125,7 @@ class ClusterConfiguration: min_memory: Optional[Union[int, str]] = None # Deprecating max_memory: Optional[Union[int, str]] = None # Deprecating num_gpus: Optional[int] = None # Deprecating + worker_tolerations: Optional[List[V1Toleration]] = None appwrapper: bool = False envs: Dict[str, str] = field(default_factory=dict) image: str = "" @@ -126,6 +139,9 @@ class ClusterConfiguration: extended_resource_mapping: Dict[str, str] = field(default_factory=dict) overwrite_default_resource_mapping: bool = False local_queue: Optional[str] = None + annotations: Dict[str, str] = field(default_factory=dict) + volumes: list[V1Volume] = field(default_factory=list) + volume_mounts: list[V1VolumeMount] = field(default_factory=list) def __post_init__(self): if not self.verify_tls: @@ -262,7 +278,10 @@ def check_type(value, expected_type): if origin_type is Union: return any(check_type(value, union_type) for union_type in args) if 
origin_type is list: - return all(check_type(elem, args[0]) for elem in value) + if value is not None: + return all(check_type(elem, args[0]) for elem in (value or [])) + else: + return True if origin_type is dict: return all( check_type(k, args[0]) and check_type(v, args[1]) diff --git a/src/codeflare_sdk/ray/cluster/test_config.py b/src/codeflare_sdk/ray/cluster/test_config.py index 3416fc28..5302e0eb 100644 --- a/src/codeflare_sdk/ray/cluster/test_config.py +++ b/src/codeflare_sdk/ray/cluster/test_config.py @@ -13,15 +13,16 @@ # limitations under the License. from codeflare_sdk.common.utils.unit_test_support import ( + apply_template, createClusterWrongType, - get_local_queue, + get_example_extended_storage_opts, create_cluster_all_config_params, + get_template_variables, ) from codeflare_sdk.ray.cluster.cluster import ClusterConfiguration, Cluster from pathlib import Path import filecmp import pytest -import yaml import os parent = Path(__file__).resolve().parents[4] # project directory @@ -36,9 +37,11 @@ def test_default_cluster_creation(mocker): cluster = Cluster(ClusterConfiguration(name="default-cluster", namespace="ns")) - with open(f"{expected_clusters_dir}/ray/default-ray-cluster.yaml") as f: - expected_rc = yaml.load(f, Loader=yaml.FullLoader) - assert cluster.resource_yaml == expected_rc + expected_rc = apply_template( + f"{expected_clusters_dir}/ray/default-ray-cluster.yaml", + get_template_variables(), + ) + assert cluster.resource_yaml == expected_rc def test_default_appwrapper_creation(mocker): @@ -50,9 +53,10 @@ def test_default_appwrapper_creation(mocker): ClusterConfiguration(name="default-appwrapper", namespace="ns", appwrapper=True) ) - with open(f"{expected_clusters_dir}/ray/default-appwrapper.yaml") as f: - expected_aw = yaml.load(f, Loader=yaml.FullLoader) - assert cluster.resource_yaml == expected_aw + expected_aw = apply_template( + f"{expected_clusters_dir}/ray/default-appwrapper.yaml", get_template_variables() + ) + assert 
cluster.resource_yaml == expected_aw def test_config_creation_all_parameters(mocker): @@ -61,6 +65,7 @@ def test_config_creation_all_parameters(mocker): expected_extended_resource_mapping = DEFAULT_RESOURCE_MAPPING expected_extended_resource_mapping.update({"example.com/gpu": "GPU"}) expected_extended_resource_mapping["intel.com/gpu"] = "TPU" + volumes, volume_mounts = get_example_extended_storage_opts() cluster = create_cluster_all_config_params(mocker, "test-all-params", False) assert cluster.config.name == "test-all-params" and cluster.config.namespace == "ns" @@ -90,6 +95,13 @@ def test_config_creation_all_parameters(mocker): ) assert cluster.config.overwrite_default_resource_mapping == True assert cluster.config.local_queue == "local-queue-default" + assert cluster.config.annotations == { + "app.kubernetes.io/managed-by": "test-prefix", + "key1": "value1", + "key2": "value2", + } + assert cluster.config.volumes == volumes + assert cluster.config.volume_mounts == volume_mounts assert filecmp.cmp( f"{aw_dir}test-all-params.yaml", diff --git a/tests/e2e/install-codeflare-sdk.sh b/tests/e2e/install-codeflare-sdk.sh index e7808582..8ec5e1e6 100644 --- a/tests/e2e/install-codeflare-sdk.sh +++ b/tests/e2e/install-codeflare-sdk.sh @@ -9,7 +9,7 @@ poetry config virtualenvs.create false cd codeflare-sdk # Lock dependencies and install them -poetry lock --no-update +poetry lock poetry install --with test,docs # Return to the workdir diff --git a/tests/test_cluster_yamls/appwrapper/unit-test-all-params.yaml b/tests/test_cluster_yamls/appwrapper/unit-test-all-params.yaml index 6d2c5440..0977d659 100644 --- a/tests/test_cluster_yamls/appwrapper/unit-test-all-params.yaml +++ b/tests/test_cluster_yamls/appwrapper/unit-test-all-params.yaml @@ -13,6 +13,8 @@ spec: metadata: annotations: app.kubernetes.io/managed-by: test-prefix + key1: value1 + key2: value2 labels: controller-tools.k8s.io: '1.0' key1: value1 @@ -76,6 +78,12 @@ spec: memory: 12G nvidia.com/gpu: 1 volumeMounts: + 
- mountPath: /home/ray/test1 + name: test + - mountPath: /home/ray/test2 + name: test2 + - mountPath: /home/ray/test2 + name: test3 - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt name: odh-trusted-ca-cert subPath: odh-trusted-ca-bundle.crt @@ -91,7 +99,24 @@ spec: imagePullSecrets: - name: secret1 - name: secret2 + tolerations: + - effect: NoSchedule + key: key1 + operator: Equal + value: value1 volumes: + - emptyDir: + sizeLimit: 500Gi + name: test + - configMap: + items: + - key: test + path: /home/ray/test2/data.txt + name: config-map-test + name: test2 + - name: test3 + secret: + secretName: test-secret - configMap: items: - key: ca-bundle.crt @@ -144,6 +169,12 @@ spec: memory: 12G nvidia.com/gpu: 1 volumeMounts: + - mountPath: /home/ray/test1 + name: test + - mountPath: /home/ray/test2 + name: test2 + - mountPath: /home/ray/test2 + name: test3 - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt name: odh-trusted-ca-cert subPath: odh-trusted-ca-bundle.crt @@ -159,7 +190,24 @@ spec: imagePullSecrets: - name: secret1 - name: secret2 + tolerations: + - effect: NoSchedule + key: key2 + operator: Equal + value: value2 volumes: + - emptyDir: + sizeLimit: 500Gi + name: test + - configMap: + items: + - key: test + path: /home/ray/test2/data.txt + name: config-map-test + name: test2 + - name: test3 + secret: + secretName: test-secret - configMap: items: - key: ca-bundle.crt diff --git a/tests/test_cluster_yamls/kueue/aw_kueue.yaml b/tests/test_cluster_yamls/kueue/aw_kueue.yaml index 402ffb6a..b5b5ae3f 100644 --- a/tests/test_cluster_yamls/kueue/aw_kueue.yaml +++ b/tests/test_cluster_yamls/kueue/aw_kueue.yaml @@ -38,7 +38,7 @@ spec: template: spec: containers: - - image: quay.io/modh/ray@sha256:0d715f92570a2997381b7cafc0e224cfa25323f18b9545acfd23bc2b71576d06 + - image: "${image}" imagePullPolicy: Always lifecycle: preStop: @@ -103,7 +103,7 @@ spec: template: spec: containers: - - image: 
quay.io/modh/ray@sha256:0d715f92570a2997381b7cafc0e224cfa25323f18b9545acfd23bc2b71576d06 + - image: "${image}" imagePullPolicy: Always lifecycle: preStop: diff --git a/tests/test_cluster_yamls/kueue/ray_cluster_kueue.yaml b/tests/test_cluster_yamls/kueue/ray_cluster_kueue.yaml index a5cb3616..ad179a0b 100644 --- a/tests/test_cluster_yamls/kueue/ray_cluster_kueue.yaml +++ b/tests/test_cluster_yamls/kueue/ray_cluster_kueue.yaml @@ -38,7 +38,7 @@ spec: template: spec: containers: - - image: quay.io/modh/ray@sha256:0d715f92570a2997381b7cafc0e224cfa25323f18b9545acfd23bc2b71576d06 + - image: "${image}" imagePullPolicy: Always lifecycle: preStop: @@ -103,7 +103,7 @@ spec: template: spec: containers: - - image: quay.io/modh/ray@sha256:0d715f92570a2997381b7cafc0e224cfa25323f18b9545acfd23bc2b71576d06 + - image: "${image}" imagePullPolicy: Always lifecycle: preStop: diff --git a/tests/test_cluster_yamls/ray/default-appwrapper.yaml b/tests/test_cluster_yamls/ray/default-appwrapper.yaml index 3e97474d..eadfeaa0 100644 --- a/tests/test_cluster_yamls/ray/default-appwrapper.yaml +++ b/tests/test_cluster_yamls/ray/default-appwrapper.yaml @@ -36,7 +36,7 @@ spec: template: spec: containers: - - image: quay.io/modh/ray@sha256:0d715f92570a2997381b7cafc0e224cfa25323f18b9545acfd23bc2b71576d06 + - image: "${image}" imagePullPolicy: Always lifecycle: preStop: @@ -101,7 +101,7 @@ spec: template: spec: containers: - - image: quay.io/modh/ray@sha256:0d715f92570a2997381b7cafc0e224cfa25323f18b9545acfd23bc2b71576d06 + - image: "${image}" imagePullPolicy: Always lifecycle: preStop: diff --git a/tests/test_cluster_yamls/ray/default-ray-cluster.yaml b/tests/test_cluster_yamls/ray/default-ray-cluster.yaml index 34de53d2..056479e6 100644 --- a/tests/test_cluster_yamls/ray/default-ray-cluster.yaml +++ b/tests/test_cluster_yamls/ray/default-ray-cluster.yaml @@ -28,7 +28,7 @@ spec: template: spec: containers: - - image: 
quay.io/modh/ray@sha256:0d715f92570a2997381b7cafc0e224cfa25323f18b9545acfd23bc2b71576d06 + - image: "${image}" imagePullPolicy: Always lifecycle: preStop: @@ -93,7 +93,7 @@ spec: template: spec: containers: - - image: quay.io/modh/ray@sha256:0d715f92570a2997381b7cafc0e224cfa25323f18b9545acfd23bc2b71576d06 + - image: "${image}" imagePullPolicy: Always lifecycle: preStop: diff --git a/tests/test_cluster_yamls/ray/unit-test-all-params.yaml b/tests/test_cluster_yamls/ray/unit-test-all-params.yaml index 8426eede..188319ab 100644 --- a/tests/test_cluster_yamls/ray/unit-test-all-params.yaml +++ b/tests/test_cluster_yamls/ray/unit-test-all-params.yaml @@ -3,6 +3,8 @@ kind: RayCluster metadata: annotations: app.kubernetes.io/managed-by: test-prefix + key1: value1 + key2: value2 labels: controller-tools.k8s.io: '1.0' key1: value1 @@ -67,6 +69,12 @@ spec: memory: 12G nvidia.com/gpu: 1 volumeMounts: + - mountPath: /home/ray/test1 + name: test + - mountPath: /home/ray/test2 + name: test2 + - mountPath: /home/ray/test2 + name: test3 - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt name: odh-trusted-ca-cert subPath: odh-trusted-ca-bundle.crt @@ -82,7 +90,24 @@ spec: imagePullSecrets: - name: secret1 - name: secret2 + tolerations: + - effect: NoSchedule + key: key1 + operator: Equal + value: value1 volumes: + - emptyDir: + sizeLimit: 500Gi + name: test + - configMap: + items: + - key: test + path: /home/ray/test2/data.txt + name: config-map-test + name: test2 + - name: test3 + secret: + secretName: test-secret - configMap: items: - key: ca-bundle.crt @@ -135,6 +160,12 @@ spec: memory: 12G nvidia.com/gpu: 1 volumeMounts: + - mountPath: /home/ray/test1 + name: test + - mountPath: /home/ray/test2 + name: test2 + - mountPath: /home/ray/test2 + name: test3 - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt name: odh-trusted-ca-cert subPath: odh-trusted-ca-bundle.crt @@ -150,7 +181,24 @@ spec: imagePullSecrets: - name: secret1 - name: secret2 + tolerations: + - effect: 
NoSchedule + key: key2 + operator: Equal + value: value2 volumes: + - emptyDir: + sizeLimit: 500Gi + name: test + - configMap: + items: + - key: test + path: /home/ray/test2/data.txt + name: config-map-test + name: test2 + - name: test3 + secret: + secretName: test-secret - configMap: items: - key: ca-bundle.crt