diff --git a/.github/workflows/additional_demo_notebook_tests.yaml b/.github/workflows/additional_demo_notebook_tests.yaml
index 03201e10..096cb509 100644
--- a/.github/workflows/additional_demo_notebook_tests.yaml
+++ b/.github/workflows/additional_demo_notebook_tests.yaml
@@ -15,7 +15,7 @@ env:
jobs:
verify-local_interactive:
if: ${{ github.event.label.name == 'test-additional-notebooks' }}
- runs-on: ubuntu-20.04-4core
+ runs-on: ubuntu-latest-4core
steps:
- name: Checkout code
@@ -50,7 +50,7 @@ jobs:
- name: Set up specific Python version
uses: actions/setup-python@v5
with:
- python-version: '3.9'
+ python-version: '3.11'
cache: 'pip' # caching pip dependencies
- name: Setup and start KinD cluster
@@ -133,7 +133,7 @@ jobs:
verify-ray_job_client:
if: ${{ github.event.label.name == 'test-additional-notebooks' }}
- runs-on: ubuntu-20.04-4core
+ runs-on: ubuntu-latest-4core
steps:
- name: Checkout code
@@ -168,7 +168,7 @@ jobs:
- name: Set up specific Python version
uses: actions/setup-python@v5
with:
- python-version: '3.9'
+ python-version: '3.11'
cache: 'pip' # caching pip dependencies
- name: Setup and start KinD cluster
diff --git a/.github/workflows/coverage-badge.yaml b/.github/workflows/coverage-badge.yaml
index af273ce9..0d2d41f6 100644
--- a/.github/workflows/coverage-badge.yaml
+++ b/.github/workflows/coverage-badge.yaml
@@ -13,16 +13,16 @@ jobs:
steps:
- uses: actions/checkout@v4
- - name: Set up Python 3.9
+ - name: Set up Python 3.11
uses: actions/setup-python@v5
with:
- python-version: 3.9
+ python-version: '3.11'  # quoted so YAML reads a string, not a float
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install poetry
poetry config virtualenvs.create false
- poetry lock --no-update
+ poetry lock
poetry install --with test
- name: Generate coverage report
run: |
diff --git a/.github/workflows/e2e_tests.yaml b/.github/workflows/e2e_tests.yaml
index fea42ab6..fca6d6e7 100644
--- a/.github/workflows/e2e_tests.yaml
+++ b/.github/workflows/e2e_tests.yaml
@@ -21,7 +21,7 @@ env:
jobs:
kubernetes:
- runs-on: ubuntu-20.04-4core-gpu
+ runs-on: gpu-t4-4-core
steps:
- name: Checkout code
@@ -56,7 +56,7 @@ jobs:
- name: Set up specific Python version
uses: actions/setup-python@v5
with:
- python-version: '3.9'
+ python-version: '3.11'
cache: 'pip' # caching pip dependencies
- name: Setup NVidia GPU environment for KinD
diff --git a/.github/workflows/guided_notebook_tests.yaml b/.github/workflows/guided_notebook_tests.yaml
index 7a77d5a3..3309c6a1 100644
--- a/.github/workflows/guided_notebook_tests.yaml
+++ b/.github/workflows/guided_notebook_tests.yaml
@@ -3,6 +3,7 @@ name: Guided notebooks tests
on:
pull_request:
branches: [ main ]
+ types: [ labeled ]
concurrency:
group: ${{ github.head_ref }}-${{ github.workflow }}
@@ -14,7 +15,7 @@ env:
jobs:
verify-0_basic_ray:
if: ${{ contains(github.event.pull_request.labels.*.name, 'test-guided-notebooks') }}
- runs-on: ubuntu-20.04-4core
+ runs-on: ubuntu-latest-4core
steps:
- name: Checkout code
@@ -49,7 +50,7 @@ jobs:
- name: Set up specific Python version
uses: actions/setup-python@v5
with:
- python-version: '3.9'
+ python-version: '3.11'
cache: 'pip' # caching pip dependencies
- name: Setup and start KinD cluster
@@ -125,7 +126,7 @@ jobs:
verify-1_cluster_job_client:
if: ${{ contains(github.event.pull_request.labels.*.name, 'test-guided-notebooks') }}
- runs-on: ubuntu-20.04-4core-gpu
+ runs-on: gpu-t4-4-core
steps:
- name: Checkout code
@@ -160,7 +161,7 @@ jobs:
- name: Set up specific Python version
uses: actions/setup-python@v5
with:
- python-version: '3.9'
+ python-version: '3.11'
cache: 'pip' # caching pip dependencies
- name: Setup NVidia GPU environment for KinD
@@ -247,7 +248,7 @@ jobs:
verify-2_basic_interactive:
if: ${{ contains(github.event.pull_request.labels.*.name, 'test-guided-notebooks') }}
- runs-on: ubuntu-20.04-4core-gpu
+ runs-on: gpu-t4-4-core
steps:
- name: Checkout code
@@ -282,7 +283,7 @@ jobs:
- name: Set up specific Python version
uses: actions/setup-python@v5
with:
- python-version: '3.9'
+ python-version: '3.11'
cache: 'pip' # caching pip dependencies
- name: Setup NVidia GPU environment for KinD
diff --git a/.github/workflows/odh-notebooks-sync.yml b/.github/workflows/odh-notebooks-sync.yml
index f0853bff..c2ae6d3a 100644
--- a/.github/workflows/odh-notebooks-sync.yml
+++ b/.github/workflows/odh-notebooks-sync.yml
@@ -37,7 +37,7 @@ env:
jobs:
build:
- runs-on: ubuntu-22.04-8core
+ runs-on: ubuntu-latest-8core
steps:
- name: Clone repository and Sync
run: |
diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml
index ddc23b5a..50e3f9e1 100644
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -17,7 +17,7 @@ on:
default: 'project-codeflare'
python_version:
type: string
- default: "3.8"
+ default: "3.11"
required: true
poetry_version:
type: string
diff --git a/.github/workflows/ui_notebooks_test.yaml b/.github/workflows/ui_notebooks_test.yaml
index 28f7e06c..1b5ad524 100644
--- a/.github/workflows/ui_notebooks_test.yaml
+++ b/.github/workflows/ui_notebooks_test.yaml
@@ -3,6 +3,7 @@ name: UI notebooks tests
on:
pull_request:
branches: [ main ]
+ types: [ labeled ]
concurrency:
group: ${{ github.head_ref }}-${{ github.workflow }}
@@ -14,7 +15,7 @@ env:
jobs:
verify-3_widget_example:
if: ${{ contains(github.event.pull_request.labels.*.name, 'test-guided-notebooks') || contains(github.event.pull_request.labels.*.name, 'test-ui-notebooks') }}
- runs-on: ubuntu-20.04-4core
+ runs-on: ubuntu-latest-4core
steps:
- name: Checkout code
@@ -49,7 +50,7 @@ jobs:
- name: Set up specific Python version
uses: actions/setup-python@v5
with:
- python-version: "3.9"
+ python-version: "3.11"
cache: "pip" # caching pip dependencies
- name: Setup and start KinD cluster
diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml
index 51934628..6697fc80 100755
--- a/.github/workflows/unit-tests.yml
+++ b/.github/workflows/unit-tests.yml
@@ -10,19 +10,19 @@ jobs:
unit-tests:
runs-on: ubuntu-latest
- container:
- image: quay.io/project-codeflare/codeflare-sdk-precommit:v0.0.3
steps:
- uses: actions/checkout@v4
+
+ - name: Set up python
+ uses: actions/setup-python@v5
+ with:
+ python-version: '3.11'
- name: Install poetry
run: pip install poetry
- - uses: actions/setup-python@v5
- with:
- python-version: '3.8'
- - name: Install dependencies
+ - name: Install dependencies with poetry
run: |
poetry config virtualenvs.create false
- poetry lock --no-update
+ poetry lock
poetry install --with test
- name: Test with pytest and check coverage
run: |
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 884632da..8a87bad4 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -6,7 +6,7 @@ Thank you for your interest in contributing to the CodeFlare SDK!
### Prerequisites
-- Python 3.9
+- Python 3.11
- [Poetry](https://python-poetry.org/)
### Setting Up Your Development Environment
diff --git a/coverage.svg b/coverage.svg
index 59d64b37..c1490035 100644
--- a/coverage.svg
+++ b/coverage.svg
@@ -15,7 +15,7 @@
coverage
coverage
- 93%
- 93%
+ 90%
+ 90%
diff --git a/docs/sphinx/user-docs/cluster-configuration.rst b/docs/sphinx/user-docs/cluster-configuration.rst
index dc3f2cf4..9f9fdddb 100644
--- a/docs/sphinx/user-docs/cluster-configuration.rst
+++ b/docs/sphinx/user-docs/cluster-configuration.rst
@@ -26,6 +26,9 @@ requirements for creating the Ray Cluster.
worker_memory_limits=2, # Default 2
# image="", # Optional Field
labels={"exampleLabel": "example", "secondLabel": "example"},
+ annotations={"key1":"value1", "key2":"value2"},
+ volumes=[], # See Custom Volumes/Volume Mounts
+ volume_mounts=[], # See Custom Volumes/Volume Mounts
))
.. note::
@@ -48,6 +51,53 @@ apply additional labels to the RayCluster resource.
After creating their ``cluster``, a user can call ``cluster.up()`` and
``cluster.down()`` to respectively create or remove the Ray Cluster.
+Custom Volumes/Volume Mounts
+----------------------------
+| To add custom Volumes and Volume Mounts to your Ray Cluster you need to create two lists ``volumes`` and ``volume_mounts``. The lists consist of ``V1Volume`` and ``V1VolumeMount`` objects respectively.
+| Populating these parameters will create Volumes and Volume Mounts for the head and each worker pod.
+
+.. code:: python
+
+ from kubernetes.client import V1Volume, V1VolumeMount, V1EmptyDirVolumeSource, V1ConfigMapVolumeSource, V1KeyToPath, V1SecretVolumeSource
+ # In this example we are using the Config Map, EmptyDir and Secret Volume types
+ volume_mounts_list = [
+ V1VolumeMount(
+ mount_path="/home/ray/test1",
+ name = "test"
+ ),
+ V1VolumeMount(
+ mount_path = "/home/ray/test2",
+ name = "test2",
+ ),
+ V1VolumeMount(
+ mount_path = "/home/ray/test3",
+ name = "test3",
+ )
+ ]
+
+ volumes_list = [
+ V1Volume(
+ name="test",
+ empty_dir=V1EmptyDirVolumeSource(size_limit="2Gi"),
+ ),
+ V1Volume(
+ name="test2",
+ config_map=V1ConfigMapVolumeSource(
+ name="test-config-map",
+ items=[V1KeyToPath(key="test", path="data.txt")]
+ )
+ ),
+ V1Volume(
+ name="test3",
+ secret=V1SecretVolumeSource(
+ secret_name="test-secret"
+ )
+ )
+ ]
+
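+| For more information on creating Volumes and Volume Mounts with Python check out the Python Kubernetes docs (`Volumes <https://github.com/kubernetes-client/python/blob/master/kubernetes/docs/V1Volume.md>`__, `Volume Mounts <https://github.com/kubernetes-client/python/blob/master/kubernetes/docs/V1VolumeMount.md>`__).
+| You can also find further information on Volumes and Volume Mounts by visiting the Kubernetes `documentation <https://kubernetes.io/docs/concepts/storage/volumes/>`__.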
+
Deprecating Parameters
----------------------
diff --git a/docs/sphinx/user-docs/e2e.rst b/docs/sphinx/user-docs/e2e.rst
index 846536f1..6f3d1462 100644
--- a/docs/sphinx/user-docs/e2e.rst
+++ b/docs/sphinx/user-docs/e2e.rst
@@ -4,7 +4,7 @@ Running e2e tests locally
Pre-requisites
^^^^^^^^^^^^^^
-- We recommend using Python 3.9, along with Poetry.
+- We recommend using Python 3.11, along with Poetry.
On KinD clusters
----------------
diff --git a/poetry.lock b/poetry.lock
index 2c9b713e..88224764 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1457,13 +1457,13 @@ testing = ["Django", "attrs", "colorama", "docopt", "pytest (<7.0.0)"]
[[package]]
name = "jinja2"
-version = "3.1.4"
+version = "3.1.5"
description = "A very fast and expressive template engine."
optional = false
python-versions = ">=3.7"
files = [
- {file = "jinja2-3.1.4-py3-none-any.whl", hash = "sha256:bc5dd2abb727a5319567b7a813e6a2e7318c39f4f487cfe6c89c6f9c7d25197d"},
- {file = "jinja2-3.1.4.tar.gz", hash = "sha256:4a3aee7acbbe7303aede8e9648d13b8bf88a429282aa6122a993f0ac800cb369"},
+ {file = "jinja2-3.1.5-py3-none-any.whl", hash = "sha256:aba0f4dc9ed8013c424088f68a5c226f7d6097ed89b246d7749c2ec4175c6adb"},
+ {file = "jinja2-3.1.5.tar.gz", hash = "sha256:8fefff8dc3034e27bb80d67c671eb8a9bc424c0ef4c0826edbff304cceff43bb"},
]
[package.dependencies]
diff --git a/src/codeflare_sdk/common/kueue/test_kueue.py b/src/codeflare_sdk/common/kueue/test_kueue.py
index 77095d4d..0093058c 100644
--- a/src/codeflare_sdk/common/kueue/test_kueue.py
+++ b/src/codeflare_sdk/common/kueue/test_kueue.py
@@ -11,7 +11,12 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-from ..utils.unit_test_support import get_local_queue, createClusterConfig
+from ..utils.unit_test_support import (
+ apply_template,
+ get_local_queue,
+ createClusterConfig,
+ get_template_variables,
+)
from unittest.mock import patch
from codeflare_sdk.ray.cluster.cluster import Cluster, ClusterConfiguration
import yaml
@@ -52,21 +57,21 @@ def test_cluster_creation_no_aw_local_queue(mocker):
config.local_queue = "local-queue-default"
cluster = Cluster(config)
assert cluster.resource_yaml == f"{aw_dir}unit-test-cluster-kueue.yaml"
- assert filecmp.cmp(
- f"{aw_dir}unit-test-cluster-kueue.yaml",
+ expected_rc = apply_template(
f"{parent}/tests/test_cluster_yamls/kueue/ray_cluster_kueue.yaml",
- shallow=True,
+ get_template_variables(),
)
+ with open(f"{aw_dir}unit-test-cluster-kueue.yaml", "r") as f:
+ cluster_kueue = yaml.load(f, Loader=yaml.FullLoader)
+ assert cluster_kueue == expected_rc
+
# With resources loaded in memory, no Local Queue specified.
config = createClusterConfig()
config.name = "unit-test-cluster-kueue"
config.write_to_file = False
cluster = Cluster(config)
-
- with open(f"{parent}/tests/test_cluster_yamls/kueue/ray_cluster_kueue.yaml") as f:
- expected_rc = yaml.load(f, Loader=yaml.FullLoader)
- assert cluster.resource_yaml == expected_rc
+ assert cluster.resource_yaml == expected_rc
def test_aw_creation_local_queue(mocker):
@@ -86,12 +91,15 @@ def test_aw_creation_local_queue(mocker):
config.local_queue = "local-queue-default"
cluster = Cluster(config)
assert cluster.resource_yaml == f"{aw_dir}unit-test-aw-kueue.yaml"
- assert filecmp.cmp(
- f"{aw_dir}unit-test-aw-kueue.yaml",
+ expected_rc = apply_template(
f"{parent}/tests/test_cluster_yamls/kueue/aw_kueue.yaml",
- shallow=True,
+ get_template_variables(),
)
+ with open(f"{aw_dir}unit-test-aw-kueue.yaml", "r") as f:
+ aw_kueue = yaml.load(f, Loader=yaml.FullLoader)
+ assert aw_kueue == expected_rc
+
# With resources loaded in memory, no Local Queue specified.
config = createClusterConfig()
config.name = "unit-test-aw-kueue"
@@ -99,9 +107,7 @@ def test_aw_creation_local_queue(mocker):
config.write_to_file = False
cluster = Cluster(config)
- with open(f"{parent}/tests/test_cluster_yamls/kueue/aw_kueue.yaml") as f:
- expected_rc = yaml.load(f, Loader=yaml.FullLoader)
- assert cluster.resource_yaml == expected_rc
+ assert cluster.resource_yaml == expected_rc
def test_get_local_queue_exists_fail(mocker):
diff --git a/src/codeflare_sdk/common/utils/unit_test_support.py b/src/codeflare_sdk/common/utils/unit_test_support.py
index 8e034378..373283b8 100644
--- a/src/codeflare_sdk/common/utils/unit_test_support.py
+++ b/src/codeflare_sdk/common/utils/unit_test_support.py
@@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
+import string
+import sys
from codeflare_sdk.ray.cluster.cluster import (
Cluster,
ClusterConfiguration,
@@ -20,6 +22,7 @@
import yaml
from pathlib import Path
from kubernetes import client
+from kubernetes.client import V1Toleration
from unittest.mock import patch
parent = Path(__file__).resolve().parents[4] # project directory
@@ -255,6 +258,32 @@ def arg_check_del_effect(group, version, namespace, plural, name, *args):
assert name == "ray-dashboard-unit-test-cluster-ray"
+def apply_template(yaml_file_path, variables):
+    """
+    Read the YAML file at yaml_file_path, fill its ``$name`` / ``${name}``
+    placeholders from the variables mapping and return the parsed document.
+    """
+    # Path.read_text opens, reads and closes the file in a single call
+    raw_text = Path(yaml_file_path).read_text()
+
+    # substitute() is strict: a placeholder without a matching key raises KeyError
+    rendered_text = string.Template(raw_text).substitute(variables)
+
+    # Parse the rendered text into plain Python objects
+    return yaml.load(rendered_text, Loader=yaml.FullLoader)
+
+
+def get_expected_image():
+    """
+    Return the Ray image reference selected by the running interpreter's version.
+    """
+    py39_image = "quay.io/modh/ray@sha256:0d715f92570a2997381b7cafc0e224cfa25323f18b9545acfd23bc2b71576d06"
+    fallback_image = "quay.io/modh/ray@sha256:db667df1bc437a7b0965e8031e905d3ab04b86390d764d120e05ea5a5c18d1b4"
+
+    # Only a 3.9 interpreter gets the first digest; every other version gets the second
+    running_on_py39 = sys.version_info[:2] == (3, 9)
+    return py39_image if running_on_py39 else fallback_image
+
+
+def get_template_variables():
+    """
+    Build the substitution mapping passed to apply_template(): a single
+    "image" entry whose value comes from get_expected_image().
+    """
+    template_variables = {}
+    template_variables["image"] = get_expected_image()
+    return template_variables
+
+
def arg_check_apply_effect(group, version, namespace, plural, body, *args):
assert namespace == "ns"
assert args == tuple()
@@ -389,6 +418,7 @@ def create_cluster_all_config_params(mocker, cluster_name, is_appwrapper) -> Clu
"kubernetes.client.CustomObjectsApi.list_namespaced_custom_object",
return_value=get_local_queue("kueue.x-k8s.io", "v1beta1", "ns", "localqueues"),
)
+ volumes, volume_mounts = get_example_extended_storage_opts()
config = ClusterConfiguration(
name=cluster_name,
@@ -398,8 +428,18 @@ def create_cluster_all_config_params(mocker, cluster_name, is_appwrapper) -> Clu
head_memory_requests=12,
head_memory_limits=16,
head_extended_resource_requests={"nvidia.com/gpu": 1, "intel.com/gpu": 2},
+ head_tolerations=[
+ V1Toleration(
+ key="key1", operator="Equal", value="value1", effect="NoSchedule"
+ )
+ ],
worker_cpu_requests=4,
worker_cpu_limits=8,
+ worker_tolerations=[
+ V1Toleration(
+ key="key2", operator="Equal", value="value2", effect="NoSchedule"
+ )
+ ],
num_workers=10,
worker_memory_requests=12,
worker_memory_limits=16,
@@ -414,5 +454,47 @@ def create_cluster_all_config_params(mocker, cluster_name, is_appwrapper) -> Clu
extended_resource_mapping={"example.com/gpu": "GPU", "intel.com/gpu": "TPU"},
overwrite_default_resource_mapping=True,
local_queue="local-queue-default",
+ annotations={"key1": "value1", "key2": "value2"},
+ volumes=volumes,
+ volume_mounts=volume_mounts,
)
return Cluster(config)
+
+
+def get_example_extended_storage_opts():
+    """
+    Build the example custom storage used by the all-parameters cluster helpers and tests.
+
+    Returns:
+        A (volumes, volume_mounts) tuple: three V1Volume objects (emptyDir,
+        configMap and secret backed) and three V1VolumeMount objects that
+        reference those volumes by name.
+    """
+    # NOTE(review): the "test3" mount reuses /home/ray/test2 and the KeyToPath
+    # path is absolute. Both mirror the expected YAML fixtures - confirm they are
+    # intentional before reusing these values against a real cluster.
+    empty_dir_volume = client.V1Volume(
+        name="test",
+        empty_dir=client.V1EmptyDirVolumeSource(size_limit="500Gi"),
+    )
+    config_map_volume = client.V1Volume(
+        name="test2",
+        config_map=client.V1ConfigMapVolumeSource(
+            name="config-map-test",
+            items=[client.V1KeyToPath(key="test", path="/home/ray/test2/data.txt")],
+        ),
+    )
+    secret_volume = client.V1Volume(
+        name="test3",
+        secret=client.V1SecretVolumeSource(secret_name="test-secret"),
+    )
+    volumes = [empty_dir_volume, config_map_volume, secret_volume]
+
+    # One mount per volume, listed in the same order as the volumes above
+    volume_mounts = [
+        client.V1VolumeMount(mount_path=path, name=volume_name)
+        for path, volume_name in (
+            ("/home/ray/test1", "test"),
+            ("/home/ray/test2", "test2"),
+            ("/home/ray/test2", "test3"),
+        )
+    ]
+    return volumes, volume_mounts
diff --git a/src/codeflare_sdk/ray/cluster/build_ray_cluster.py b/src/codeflare_sdk/ray/cluster/build_ray_cluster.py
index e590d483..215ac32e 100644
--- a/src/codeflare_sdk/ray/cluster/build_ray_cluster.py
+++ b/src/codeflare_sdk/ray/cluster/build_ray_cluster.py
@@ -16,7 +16,7 @@
This sub-module exists primarily to be used internally by the Cluster object
(in the cluster sub-module) for RayCluster/AppWrapper generation.
"""
-from typing import Union, Tuple, Dict
+from typing import List, Union, Tuple, Dict
from ...common import _kube_api_error_handling
from ...common.kubernetes_cluster import get_api_client, config_check
from kubernetes.client.exceptions import ApiException
@@ -40,6 +40,7 @@
V1PodTemplateSpec,
V1PodSpec,
V1LocalObjectReference,
+ V1Toleration,
)
import yaml
@@ -139,7 +140,11 @@ def build_ray_cluster(cluster: "codeflare_sdk.ray.cluster.Cluster"):
"resources": head_resources,
},
"template": {
- "spec": get_pod_spec(cluster, [get_head_container_spec(cluster)])
+ "spec": get_pod_spec(
+ cluster,
+ [get_head_container_spec(cluster)],
+ cluster.config.head_tolerations,
+ )
},
},
"workerGroupSpecs": [
@@ -154,7 +159,11 @@ def build_ray_cluster(cluster: "codeflare_sdk.ray.cluster.Cluster"):
"resources": worker_resources,
},
"template": V1PodTemplateSpec(
- spec=get_pod_spec(cluster, [get_worker_container_spec(cluster)])
+ spec=get_pod_spec(
+ cluster,
+ [get_worker_container_spec(cluster)],
+ cluster.config.worker_tolerations,
+ )
),
}
],
@@ -191,7 +200,7 @@ def get_metadata(cluster: "codeflare_sdk.ray.cluster.Cluster"):
)
# Get the NB annotation if it exists - could be useful in future for a "annotations" parameter.
- annotations = get_nb_annotations()
+ annotations = with_nb_annotations(cluster.config.annotations)
if annotations != {}:
object_meta.annotations = annotations # As annotations are not a guarantee they are appended to the metadata after creation.
return object_meta
@@ -213,11 +222,10 @@ def get_labels(cluster: "codeflare_sdk.ray.cluster.Cluster"):
return labels
-def get_nb_annotations():
+def with_nb_annotations(annotations: dict):
"""
- The get_nb_annotations() function generates the annotation for NB Prefix if the SDK is running in a notebook
+ The with_nb_annotations() function generates the annotation for NB Prefix if the SDK is running in a notebook and appends any user set annotations
"""
- annotations = {}
# Notebook annotation
nb_prefix = os.environ.get("NB_PREFIX")
@@ -244,14 +252,21 @@ def update_image(image) -> str:
return image
-def get_pod_spec(cluster: "codeflare_sdk.ray.cluster.Cluster", containers):
+def get_pod_spec(
+ cluster: "codeflare_sdk.ray.cluster.Cluster",
+ containers: List,
+ tolerations: List[V1Toleration],
+) -> V1PodSpec:
"""
The get_pod_spec() function generates a V1PodSpec for the head/worker containers
"""
+
pod_spec = V1PodSpec(
containers=containers,
- volumes=VOLUMES,
+ volumes=generate_custom_storage(cluster.config.volumes, VOLUMES),
+ tolerations=tolerations or None,
)
+
if cluster.config.image_pull_secrets != []:
pod_spec.image_pull_secrets = generate_image_pull_secrets(cluster)
@@ -296,7 +311,9 @@ def get_head_container_spec(
cluster.config.head_memory_limits,
cluster.config.head_extended_resource_requests,
),
- volume_mounts=VOLUME_MOUNTS,
+ volume_mounts=generate_custom_storage(
+ cluster.config.volume_mounts, VOLUME_MOUNTS
+ ),
)
if cluster.config.envs != {}:
head_container.env = generate_env_vars(cluster)
@@ -338,7 +355,9 @@ def get_worker_container_spec(
cluster.config.worker_memory_limits,
cluster.config.worker_extended_resource_requests,
),
- volume_mounts=VOLUME_MOUNTS,
+ volume_mounts=generate_custom_storage(
+ cluster.config.volume_mounts, VOLUME_MOUNTS
+ ),
)
if cluster.config.envs != {}:
@@ -522,6 +541,22 @@ def wrap_cluster(
# Etc.
+def generate_custom_storage(provided_storage: list, default_storage: list):
+    """
+    The generate_custom_storage function combines the user-provided volumes/volume mounts with the default volumes/volume mounts.
+
+    Args:
+        provided_storage: entries supplied by the user; may be empty or None.
+        default_storage: entries that are always added after the provided ones.
+
+    Returns:
+        A new list holding the provided entries followed by the defaults.
+        Neither input list is modified or returned directly.
+    """
+    # Always build a fresh list: handing back default_storage itself would let a
+    # later mutation of the result leak into the caller's defaults, and a None
+    # value must not crash on .copy().
+    return [*(provided_storage or []), *default_storage]
+
+
def write_to_file(cluster: "codeflare_sdk.ray.cluster.Cluster", resource: dict):
"""
The write_to_file function writes the built Ray Cluster/AppWrapper dict as a yaml file in the .codeflare folder
diff --git a/src/codeflare_sdk/ray/cluster/config.py b/src/codeflare_sdk/ray/cluster/config.py
index b8b097f8..ab64be83 100644
--- a/src/codeflare_sdk/ray/cluster/config.py
+++ b/src/codeflare_sdk/ray/cluster/config.py
@@ -22,6 +22,7 @@
import warnings
from dataclasses import dataclass, field, fields
from typing import Dict, List, Optional, Union, get_args, get_origin
+from kubernetes.client import V1Toleration, V1Volume, V1VolumeMount
dir = pathlib.Path(__file__).parent.parent.resolve()
@@ -57,6 +58,8 @@ class ClusterConfiguration:
The number of GPUs to allocate to the head node. (Deprecated, use head_extended_resource_requests)
head_extended_resource_requests:
A dictionary of extended resource requests for the head node. ex: {"nvidia.com/gpu": 1}
+ head_tolerations:
+ List of tolerations for head nodes.
min_cpus:
The minimum number of CPUs to allocate to each worker.
max_cpus:
@@ -69,6 +72,8 @@ class ClusterConfiguration:
The maximum amount of memory to allocate to each worker.
num_gpus:
The number of GPUs to allocate to each worker. (Deprecated, use worker_extended_resource_requests)
+ worker_tolerations:
+ List of tolerations for worker nodes.
appwrapper:
A boolean indicating whether to use an AppWrapper.
envs:
@@ -89,6 +94,12 @@ class ClusterConfiguration:
A dictionary of custom resource mappings to map extended resource requests to RayCluster resource names
overwrite_default_resource_mapping:
A boolean indicating whether to overwrite the default resource mapping.
+ annotations:
+ A dictionary of annotations to apply to the cluster.
+ volumes:
+ A list of V1Volume objects to add to the Cluster
+ volume_mounts:
+ A list of V1VolumeMount objects to add to the Cluster
"""
name: str
@@ -103,6 +114,7 @@ class ClusterConfiguration:
head_extended_resource_requests: Dict[str, Union[str, int]] = field(
default_factory=dict
)
+ head_tolerations: Optional[List[V1Toleration]] = None
worker_cpu_requests: Union[int, str] = 1
worker_cpu_limits: Union[int, str] = 1
min_cpus: Optional[Union[int, str]] = None # Deprecating
@@ -113,6 +125,7 @@ class ClusterConfiguration:
min_memory: Optional[Union[int, str]] = None # Deprecating
max_memory: Optional[Union[int, str]] = None # Deprecating
num_gpus: Optional[int] = None # Deprecating
+ worker_tolerations: Optional[List[V1Toleration]] = None
appwrapper: bool = False
envs: Dict[str, str] = field(default_factory=dict)
image: str = ""
@@ -126,6 +139,9 @@ class ClusterConfiguration:
extended_resource_mapping: Dict[str, str] = field(default_factory=dict)
overwrite_default_resource_mapping: bool = False
local_queue: Optional[str] = None
+ annotations: Dict[str, str] = field(default_factory=dict)
+ volumes: list[V1Volume] = field(default_factory=list)
+ volume_mounts: list[V1VolumeMount] = field(default_factory=list)
def __post_init__(self):
if not self.verify_tls:
@@ -262,7 +278,10 @@ def check_type(value, expected_type):
if origin_type is Union:
return any(check_type(value, union_type) for union_type in args)
if origin_type is list:
- return all(check_type(elem, args[0]) for elem in value)
+ if value is not None:
+ return all(check_type(elem, args[0]) for elem in (value or []))
+ else:
+ return True
if origin_type is dict:
return all(
check_type(k, args[0]) and check_type(v, args[1])
diff --git a/src/codeflare_sdk/ray/cluster/test_config.py b/src/codeflare_sdk/ray/cluster/test_config.py
index 3416fc28..5302e0eb 100644
--- a/src/codeflare_sdk/ray/cluster/test_config.py
+++ b/src/codeflare_sdk/ray/cluster/test_config.py
@@ -13,15 +13,16 @@
# limitations under the License.
from codeflare_sdk.common.utils.unit_test_support import (
+ apply_template,
createClusterWrongType,
- get_local_queue,
+ get_example_extended_storage_opts,
create_cluster_all_config_params,
+ get_template_variables,
)
from codeflare_sdk.ray.cluster.cluster import ClusterConfiguration, Cluster
from pathlib import Path
import filecmp
import pytest
-import yaml
import os
parent = Path(__file__).resolve().parents[4] # project directory
@@ -36,9 +37,11 @@ def test_default_cluster_creation(mocker):
cluster = Cluster(ClusterConfiguration(name="default-cluster", namespace="ns"))
- with open(f"{expected_clusters_dir}/ray/default-ray-cluster.yaml") as f:
- expected_rc = yaml.load(f, Loader=yaml.FullLoader)
- assert cluster.resource_yaml == expected_rc
+ expected_rc = apply_template(
+ f"{expected_clusters_dir}/ray/default-ray-cluster.yaml",
+ get_template_variables(),
+ )
+ assert cluster.resource_yaml == expected_rc
def test_default_appwrapper_creation(mocker):
@@ -50,9 +53,10 @@ def test_default_appwrapper_creation(mocker):
ClusterConfiguration(name="default-appwrapper", namespace="ns", appwrapper=True)
)
- with open(f"{expected_clusters_dir}/ray/default-appwrapper.yaml") as f:
- expected_aw = yaml.load(f, Loader=yaml.FullLoader)
- assert cluster.resource_yaml == expected_aw
+ expected_aw = apply_template(
+ f"{expected_clusters_dir}/ray/default-appwrapper.yaml", get_template_variables()
+ )
+ assert cluster.resource_yaml == expected_aw
def test_config_creation_all_parameters(mocker):
@@ -61,6 +65,7 @@ def test_config_creation_all_parameters(mocker):
expected_extended_resource_mapping = DEFAULT_RESOURCE_MAPPING
expected_extended_resource_mapping.update({"example.com/gpu": "GPU"})
expected_extended_resource_mapping["intel.com/gpu"] = "TPU"
+ volumes, volume_mounts = get_example_extended_storage_opts()
cluster = create_cluster_all_config_params(mocker, "test-all-params", False)
assert cluster.config.name == "test-all-params" and cluster.config.namespace == "ns"
@@ -90,6 +95,13 @@ def test_config_creation_all_parameters(mocker):
)
assert cluster.config.overwrite_default_resource_mapping == True
assert cluster.config.local_queue == "local-queue-default"
+ assert cluster.config.annotations == {
+ "app.kubernetes.io/managed-by": "test-prefix",
+ "key1": "value1",
+ "key2": "value2",
+ }
+ assert cluster.config.volumes == volumes
+ assert cluster.config.volume_mounts == volume_mounts
assert filecmp.cmp(
f"{aw_dir}test-all-params.yaml",
diff --git a/tests/e2e/install-codeflare-sdk.sh b/tests/e2e/install-codeflare-sdk.sh
index e7808582..8ec5e1e6 100644
--- a/tests/e2e/install-codeflare-sdk.sh
+++ b/tests/e2e/install-codeflare-sdk.sh
@@ -9,7 +9,7 @@ poetry config virtualenvs.create false
cd codeflare-sdk
# Lock dependencies and install them
-poetry lock --no-update
+poetry lock
poetry install --with test,docs
# Return to the workdir
diff --git a/tests/test_cluster_yamls/appwrapper/unit-test-all-params.yaml b/tests/test_cluster_yamls/appwrapper/unit-test-all-params.yaml
index 6d2c5440..0977d659 100644
--- a/tests/test_cluster_yamls/appwrapper/unit-test-all-params.yaml
+++ b/tests/test_cluster_yamls/appwrapper/unit-test-all-params.yaml
@@ -13,6 +13,8 @@ spec:
metadata:
annotations:
app.kubernetes.io/managed-by: test-prefix
+ key1: value1
+ key2: value2
labels:
controller-tools.k8s.io: '1.0'
key1: value1
@@ -76,6 +78,12 @@ spec:
memory: 12G
nvidia.com/gpu: 1
volumeMounts:
+ - mountPath: /home/ray/test1
+ name: test
+ - mountPath: /home/ray/test2
+ name: test2
+ - mountPath: /home/ray/test2
+ name: test3
- mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt
name: odh-trusted-ca-cert
subPath: odh-trusted-ca-bundle.crt
@@ -91,7 +99,24 @@ spec:
imagePullSecrets:
- name: secret1
- name: secret2
+ tolerations:
+ - effect: NoSchedule
+ key: key1
+ operator: Equal
+ value: value1
volumes:
+ - emptyDir:
+ sizeLimit: 500Gi
+ name: test
+ - configMap:
+ items:
+ - key: test
+ path: /home/ray/test2/data.txt
+ name: config-map-test
+ name: test2
+ - name: test3
+ secret:
+ secretName: test-secret
- configMap:
items:
- key: ca-bundle.crt
@@ -144,6 +169,12 @@ spec:
memory: 12G
nvidia.com/gpu: 1
volumeMounts:
+ - mountPath: /home/ray/test1
+ name: test
+ - mountPath: /home/ray/test2
+ name: test2
+ - mountPath: /home/ray/test2
+ name: test3
- mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt
name: odh-trusted-ca-cert
subPath: odh-trusted-ca-bundle.crt
@@ -159,7 +190,24 @@ spec:
imagePullSecrets:
- name: secret1
- name: secret2
+ tolerations:
+ - effect: NoSchedule
+ key: key2
+ operator: Equal
+ value: value2
volumes:
+ - emptyDir:
+ sizeLimit: 500Gi
+ name: test
+ - configMap:
+ items:
+ - key: test
+ path: /home/ray/test2/data.txt
+ name: config-map-test
+ name: test2
+ - name: test3
+ secret:
+ secretName: test-secret
- configMap:
items:
- key: ca-bundle.crt
diff --git a/tests/test_cluster_yamls/kueue/aw_kueue.yaml b/tests/test_cluster_yamls/kueue/aw_kueue.yaml
index 402ffb6a..b5b5ae3f 100644
--- a/tests/test_cluster_yamls/kueue/aw_kueue.yaml
+++ b/tests/test_cluster_yamls/kueue/aw_kueue.yaml
@@ -38,7 +38,7 @@ spec:
template:
spec:
containers:
- - image: quay.io/modh/ray@sha256:0d715f92570a2997381b7cafc0e224cfa25323f18b9545acfd23bc2b71576d06
+ - image: "${image}"
imagePullPolicy: Always
lifecycle:
preStop:
@@ -103,7 +103,7 @@ spec:
template:
spec:
containers:
- - image: quay.io/modh/ray@sha256:0d715f92570a2997381b7cafc0e224cfa25323f18b9545acfd23bc2b71576d06
+ - image: "${image}"
imagePullPolicy: Always
lifecycle:
preStop:
diff --git a/tests/test_cluster_yamls/kueue/ray_cluster_kueue.yaml b/tests/test_cluster_yamls/kueue/ray_cluster_kueue.yaml
index a5cb3616..ad179a0b 100644
--- a/tests/test_cluster_yamls/kueue/ray_cluster_kueue.yaml
+++ b/tests/test_cluster_yamls/kueue/ray_cluster_kueue.yaml
@@ -38,7 +38,7 @@ spec:
template:
spec:
containers:
- - image: quay.io/modh/ray@sha256:0d715f92570a2997381b7cafc0e224cfa25323f18b9545acfd23bc2b71576d06
+ - image: "${image}"
imagePullPolicy: Always
lifecycle:
preStop:
@@ -103,7 +103,7 @@ spec:
template:
spec:
containers:
- - image: quay.io/modh/ray@sha256:0d715f92570a2997381b7cafc0e224cfa25323f18b9545acfd23bc2b71576d06
+ - image: "${image}"
imagePullPolicy: Always
lifecycle:
preStop:
diff --git a/tests/test_cluster_yamls/ray/default-appwrapper.yaml b/tests/test_cluster_yamls/ray/default-appwrapper.yaml
index 3e97474d..eadfeaa0 100644
--- a/tests/test_cluster_yamls/ray/default-appwrapper.yaml
+++ b/tests/test_cluster_yamls/ray/default-appwrapper.yaml
@@ -36,7 +36,7 @@ spec:
template:
spec:
containers:
- - image: quay.io/modh/ray@sha256:0d715f92570a2997381b7cafc0e224cfa25323f18b9545acfd23bc2b71576d06
+ - image: "${image}"
imagePullPolicy: Always
lifecycle:
preStop:
@@ -101,7 +101,7 @@ spec:
template:
spec:
containers:
- - image: quay.io/modh/ray@sha256:0d715f92570a2997381b7cafc0e224cfa25323f18b9545acfd23bc2b71576d06
+ - image: "${image}"
imagePullPolicy: Always
lifecycle:
preStop:
diff --git a/tests/test_cluster_yamls/ray/default-ray-cluster.yaml b/tests/test_cluster_yamls/ray/default-ray-cluster.yaml
index 34de53d2..056479e6 100644
--- a/tests/test_cluster_yamls/ray/default-ray-cluster.yaml
+++ b/tests/test_cluster_yamls/ray/default-ray-cluster.yaml
@@ -28,7 +28,7 @@ spec:
template:
spec:
containers:
- - image: quay.io/modh/ray@sha256:0d715f92570a2997381b7cafc0e224cfa25323f18b9545acfd23bc2b71576d06
+ - image: "${image}"
imagePullPolicy: Always
lifecycle:
preStop:
@@ -93,7 +93,7 @@ spec:
template:
spec:
containers:
- - image: quay.io/modh/ray@sha256:0d715f92570a2997381b7cafc0e224cfa25323f18b9545acfd23bc2b71576d06
+ - image: "${image}"
imagePullPolicy: Always
lifecycle:
preStop:
diff --git a/tests/test_cluster_yamls/ray/unit-test-all-params.yaml b/tests/test_cluster_yamls/ray/unit-test-all-params.yaml
index 8426eede..188319ab 100644
--- a/tests/test_cluster_yamls/ray/unit-test-all-params.yaml
+++ b/tests/test_cluster_yamls/ray/unit-test-all-params.yaml
@@ -3,6 +3,8 @@ kind: RayCluster
metadata:
annotations:
app.kubernetes.io/managed-by: test-prefix
+ key1: value1
+ key2: value2
labels:
controller-tools.k8s.io: '1.0'
key1: value1
@@ -67,6 +69,12 @@ spec:
memory: 12G
nvidia.com/gpu: 1
volumeMounts:
+ - mountPath: /home/ray/test1
+ name: test
+ - mountPath: /home/ray/test2
+ name: test2
+ - mountPath: /home/ray/test2
+ name: test3
- mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt
name: odh-trusted-ca-cert
subPath: odh-trusted-ca-bundle.crt
@@ -82,7 +90,24 @@ spec:
imagePullSecrets:
- name: secret1
- name: secret2
+ tolerations:
+ - effect: NoSchedule
+ key: key1
+ operator: Equal
+ value: value1
volumes:
+ - emptyDir:
+ sizeLimit: 500Gi
+ name: test
+ - configMap:
+ items:
+ - key: test
+ path: /home/ray/test2/data.txt
+ name: config-map-test
+ name: test2
+ - name: test3
+ secret:
+ secretName: test-secret
- configMap:
items:
- key: ca-bundle.crt
@@ -135,6 +160,12 @@ spec:
memory: 12G
nvidia.com/gpu: 1
volumeMounts:
+ - mountPath: /home/ray/test1
+ name: test
+ - mountPath: /home/ray/test2
+ name: test2
+ - mountPath: /home/ray/test2
+ name: test3
- mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt
name: odh-trusted-ca-cert
subPath: odh-trusted-ca-bundle.crt
@@ -150,7 +181,24 @@ spec:
imagePullSecrets:
- name: secret1
- name: secret2
+ tolerations:
+ - effect: NoSchedule
+ key: key2
+ operator: Equal
+ value: value2
volumes:
+ - emptyDir:
+ sizeLimit: 500Gi
+ name: test
+ - configMap:
+ items:
+ - key: test
+ path: /home/ray/test2/data.txt
+ name: config-map-test
+ name: test2
+ - name: test3
+ secret:
+ secretName: test-secret
- configMap:
items:
- key: ca-bundle.crt