Draft pull request: changes from all commits (44 commits)
1c77415
feat: support geospatial benchmark
michael-johnston Oct 15, 2025
52ccff4
feat: add geospatial experiments
michael-johnston Oct 15, 2025
98591b5
various fixes to the vllm_performance actuator
christian-pinto Oct 15, 2025
bd62781
fix: add max_batch_tokens
michael-johnston Oct 15, 2025
016bf1c
Merge remote-tracking branch 'origin/main' into maj_vllm_geospatial
michael-johnston Oct 16, 2025
6b7c67b
Merge remote-tracking branch 'origin/maj_vllm_geospatial' into maj_vl…
michael-johnston Oct 18, 2025
c1dec4a
Updated vllm performance actuator to support geospatial
christian-pinto Oct 28, 2025
2e03028
Temporarily avoiding cpu14
christian-pinto Oct 30, 2025
80c68c4
Temporarily avoiding cpu14
christian-pinto Oct 30, 2025
592e179
Added india dataset
christian-pinto Oct 30, 2025
94c7490
Fixed BaseSamplerConfig
christian-pinto Oct 30, 2025
3fd83b8
Some changes to the vllmperformance experiments
christian-pinto Oct 30, 2025
90ae6bb
Some changes to the experiment and reverted the deployment…
christian-pinto Oct 31, 2025
a750975
Removed some clutter from deployment template
christian-pinto Oct 31, 2025
c432cff
Few more fixes
christian-pinto Oct 31, 2025
b851d03
Fixed bug in validate_entity
christian-pinto Nov 3, 2025
6013270
Merge branch 'cp-fix-run-experiment' into maj_vllm_geospatial
christian-pinto Nov 3, 2025
7055c38
One more fix to a log message
christian-pinto Nov 3, 2025
c212fe6
Merge branch 'cp-fix-run-experiment' into maj_vllm_geospatial
christian-pinto Nov 3, 2025
5bdf902
One more fix to a log message
christian-pinto Nov 3, 2025
dbab4c7
One more fix to a log message
christian-pinto Nov 3, 2025
4c1ea32
Merge branch 'cp-fix-run-experiment' into maj_vllm_geospatial
christian-pinto Nov 3, 2025
fd100b6
Fixes to vllm_performance actuator
christian-pinto Nov 3, 2025
f7ceb52
fix(experiment): Fixed bug in validate_entity
christian-pinto Nov 3, 2025
dcfe10e
Merge branch 'cp-fix-run-experiment' into maj_vllm_geospatial
christian-pinto Nov 3, 2025
df4f9bc
fix: Not using reference which may be parameterized
michael-johnston Nov 3, 2025
601bf7b
Merge remote-tracking branch 'origin/cp-fix-run-experiment' into cp-f…
michael-johnston Nov 3, 2025
d744287
fix: validate_entity
michael-johnston Nov 3, 2025
1869a49
fix: missing return
michael-johnston Nov 3, 2025
883e877
test: for validate_entity
michael-johnston Nov 3, 2025
4dc6bda
Merge branch 'cp-fix-run-experiment' into maj_vllm_geospatial
christian-pinto Nov 4, 2025
064750f
chore: fixed formatting with black
christian-pinto Nov 4, 2025
35bb2e2
chore: Removed dataset file as it was relocated to a different folder
christian-pinto Nov 4, 2025
c68ab35
feat: Added custom dataset geospatial experiment
christian-pinto Nov 4, 2025
25aaf9f
Merge branch 'main' into maj_vllm_geospatial
christian-pinto Nov 4, 2025
8f83bf3
Merge branch 'main' into maj_vllm_geospatial
christian-pinto Nov 6, 2025
0e97d20
Merge remote-tracking branch 'origin' into maj_vllm_geospatial
christian-pinto Nov 6, 2025
41815b6
fix: Reworked vllm_catalog actuator experiments catalog loading
christian-pinto Nov 6, 2025
53bf77a
chore: Improved experiment definition language and properties typing
christian-pinto Nov 6, 2025
18f217c
fix: Improved logic for fetching a dataset in the geospatial benchmark
christian-pinto Nov 6, 2025
e937794
chore: various fixes after review
christian-pinto Nov 6, 2025
5764da2
Merge branch 'main' into maj_vllm_geospatial
christian-pinto Nov 7, 2025
bad0133
chore(performance_testing_geospatial): removing target properties tha…
christian-pinto Nov 7, 2025
164e13e
chore(execute_benchmark): docstrings cleanup
christian-pinto Nov 7, 2025
=== Changed file 1 of 4: vllm_performance actuator module (experiment catalog and submit) ===
@@ -3,8 +3,8 @@

import json
import logging
import os
import uuid
from pathlib import Path

import ray
import yaml
@@ -54,14 +54,27 @@ def catalog(
) -> ExperimentCatalog:
"""Returns the Experiments your actuator provides"""

# The catalog can be formed in code here or read from a file containing the Experiments models
# This shows reading from a file
# Loading experiment definitions from the YAML files contained in the `experiments` directory.
# NOTE: Only files can be placed in the experiments directory,
# but each file can contain multiple experiment definitions
curr_path = Path(__file__)
exp_dir = curr_path.parent / Path("experiments")
logger.debug(f"Experiments dir {exp_dir.absolute()}")
experiments = []
for exp_file in exp_dir.iterdir():
if exp_file.is_dir():
continue

logger.debug(f"Loading experiments from {exp_file.name}")
try:
file_data = exp_file.read_text()
data = yaml.safe_load(file_data)
except yaml.YAMLError:
error_message = f"File {exp_file.name} is a malformed YAML"
logger.error(error_message)
raise ValueError(error_message)

path = os.path.abspath(__file__)
path = os.path.split(path)[0]
with open(os.path.join(path, "experiments.yaml")) as f:
data = yaml.safe_load(f)
experiments = [Experiment(**data[e]) for e in data]
experiments.extend([Experiment.model_validate(data[e]) for e in data])

return ExperimentCatalog(
catalogIdentifier=cls.identifier,
@@ -176,7 +189,11 @@ async def submit(
if experiment.deprecated is True:
raise DeprecatedExperimentError(f"Experiment {experiment} is deprecated")

if experiment.identifier == "performance-testing-full":
if experiment.identifier in [
"performance-testing-full",
"performance-testing-geospatial-full",
"performance-testing-geospatial-full-custom-dataset",
]:
if not self.env_manager:
raise MissingConfigurationForExperimentError(
f"Actuator configuration did not contain sufficient information for a kubernetes environment manager to be created. "
@@ -197,7 +214,7 @@
)

# Execute experiment
# Note: Here the experiment instance is just past for convenience since we retrieved it above
# Note: Here the experiment instance is just passed for convenience since we retrieved it above
run_resource_and_workload_experiment.remote(
request=request,
experiment=experiment,
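For readers skimming the hunk above, here is a minimal, self-contained sketch of the new per-file catalog loading. The `Experiment` class below is only an illustrative stand-in for the actuator's real pydantic model, and the directory layout follows the comment in the diff; treat it as a sketch, not the actual implementation.

```python
# Hedged sketch of the catalog-loading pattern introduced in the diff above.
from pathlib import Path

import yaml
from pydantic import BaseModel


class Experiment(BaseModel):  # illustrative stand-in, not the real ado model
    identifier: str = ""
    deprecated: bool = False


def load_experiments(experiments_dir: Path) -> list[Experiment]:
    """Collect every experiment definition from the YAML files in a directory.

    Each file may define multiple experiments keyed by name; sub-directories
    are skipped, matching the loop in the diff.
    """
    experiments: list[Experiment] = []
    for exp_file in experiments_dir.iterdir():
        if exp_file.is_dir():
            continue
        try:
            data = yaml.safe_load(exp_file.read_text())
        except yaml.YAMLError as exc:
            raise ValueError(f"File {exp_file.name} is malformed YAML") from exc
        # One file can hold several definitions, each keyed by its name.
        experiments.extend(Experiment.model_validate(data[name]) for name in data)
    return experiments
```

Keeping one YAML file per benchmark family means the catalog can grow (for example, adding the geospatial experiments) without touching the loading code, which is presumably why the hard-coded `experiments.yaml` read was replaced.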
=== Changed file 2 of 4: new custom-dataset file for the geospatial benchmark (JSON Lines, one prompt) ===
@@ -0,0 +1 @@
{"prompt":{"data": {"data": "https://huggingface.co/christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM/resolve/main/India_900498_S2Hand.tif","data_format": "url","out_data_format": "b64_json","indices": [1, 2, 3, 8, 11, 12]},"priority": 0,"softmax": false}}
=== Changed file 3 of 4: benchmark execution tasks (environment creation and benchmark dispatch) ===
@@ -7,6 +7,7 @@
import subprocess
import sys
import time
import traceback

import ray
from ado_actuators.vllm_performance.actuator_parameters import (
@@ -22,6 +23,7 @@
VLLMDtype,
)
from ado_actuators.vllm_performance.vllm_performance_test.execute_benchmark import (
execute_geospatial_benchmark,
execute_random_benchmark,
)
from ray.actor import ActorHandle
@@ -142,6 +144,9 @@ def _create_environment(
reuse_deployment=False,
pvc_name=actuator.pvc_template,
namespace=actuator.namespace,
skip_tokenizer_init=values.get("skip_tokenizer_init"),
enforce_eager=values.get("enforce_eager"),
io_processor_plugin=values.get("io_processor_plugin"),
)
# Update manager
env_manager.done_creating.remote(definition=definition)
@@ -151,6 +156,7 @@
logger.error(
f"Attempt {attempt}. Failed to create test environment {e}"
)
logger.error(traceback.format_exception(e))
error = f"Failed to create test environment {e}"
time.sleep(tmout)
tmout *= 2
@@ -279,23 +285,42 @@ def run_resource_and_workload_experiment(
start = time.time()
result = None
try:
result = execute_random_benchmark(
base_url=base_url,
model=values.get("model"),
interpreter=actuator_parameters.interpreter,
num_prompts=int(values.get("num_prompts")),
request_rate=request_rate,
max_concurrency=max_concurrency,
hf_token=actuator_parameters.hf_token,
benchmark_retries=actuator_parameters.benchmark_retries,
retries_timeout=actuator_parameters.retries_timeout,
number_input_tokens=int(values.get("number_input_tokens")),
max_output_tokens=int(values.get("max_output_tokens")),
burstiness=float(values.get("burstiness")),
)
if experiment.identifier in [
"performance-testing-geospatial-full",
"performance-testing-geospatial-full-custom-dataset",
]:
result = execute_geospatial_benchmark(
base_url=base_url,
model=values.get("model"),
interpreter=actuator_parameters.interpreter,
num_prompts=int(values.get("num_prompts")),
request_rate=request_rate,
max_concurrency=max_concurrency,
hf_token=actuator_parameters.hf_token,
benchmark_retries=actuator_parameters.benchmark_retries,
retries_timeout=actuator_parameters.retries_timeout,
burstiness=float(values.get("burstiness")),
dataset=values.get("dataset"),
)
else:
result = execute_random_benchmark(
base_url=base_url,
model=values.get("model"),
interpreter=actuator_parameters.interpreter,
num_prompts=int(values.get("num_prompts")),
request_rate=request_rate,
max_concurrency=max_concurrency,
hf_token=actuator_parameters.hf_token,
benchmark_retries=actuator_parameters.benchmark_retries,
retries_timeout=actuator_parameters.retries_timeout,
number_input_tokens=int(values.get("number_input_tokens")),
max_output_tokens=int(values.get("max_output_tokens")),
burstiness=float(values.get("burstiness")),
dataset=values.get("dataset"),
)
logger.debug(f"benchmark executed in {time.time() - start} sec")
except Exception as e:
logger.error(f"Failed to execute VLLM performance test {e}")
logger.error(traceback.format_exception(e))
error = f"Failed to execute VLLM performance test {e}"
finally:
if pf is not None:
@@ -379,20 +404,36 @@ def run_workload_experiment(
error = None
measured_values = []
try:
result = execute_random_benchmark(
base_url=values.get("endpoint"),
model=values.get("model"),
interpreter=actuator_parameters.interpreter,
num_prompts=int(values.get("num_prompts")),
request_rate=request_rate,
max_concurrency=max_concurrency,
hf_token=actuator_parameters.hf_token,
benchmark_retries=actuator_parameters.benchmark_retries,
retries_timeout=actuator_parameters.retries_timeout,
number_input_tokens=int(values.get("number_input_tokens")),
max_output_tokens=int(values.get("max_output_tokens")),
burstiness=float(values.get("burstiness")),
)
if experiment.identifier == "performance-testing-geospatial-endpoint":
result = execute_geospatial_benchmark(
base_url=values.get("endpoint"),
model=values.get("model"),
interpreter=actuator_parameters.interpreter,
num_prompts=int(values.get("num_prompts")),
request_rate=request_rate,
max_concurrency=max_concurrency,
hf_token=actuator_parameters.hf_token,
benchmark_retries=actuator_parameters.benchmark_retries,
retries_timeout=actuator_parameters.retries_timeout,
burstiness=float(values.get("burstiness")),
dataset=values.get("dataset"),
)
else:
result = execute_random_benchmark(
base_url=values.get("endpoint"),
model=values.get("model"),
interpreter=actuator_parameters.interpreter,
num_prompts=int(values.get("num_prompts")),
request_rate=request_rate,
max_concurrency=max_concurrency,
hf_token=actuator_parameters.hf_token,
benchmark_retries=actuator_parameters.benchmark_retries,
retries_timeout=actuator_parameters.retries_timeout,
number_input_tokens=int(values.get("number_input_tokens")),
max_output_tokens=int(values.get("max_output_tokens")),
burstiness=float(values.get("burstiness")),
dataset=values.get("dataset"),
)
except Exception as e:
logger.error(f"Failed to execute VLLM performance test {e}")
error = f"Failed to execute VLLM performance test {e}"
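The two hunks above add the same branch twice, once per experiment runner: geospatial experiment identifiers are routed to `execute_geospatial_benchmark`, everything else falls back to `execute_random_benchmark`, and only the random path keeps the token-length parameters. A hedged condensation of that dispatch, with the arguments shared by both runners collapsed into `**common` (the real code passes them explicitly):

```python
# Hedged sketch of the benchmark dispatch added in the diff above.
# The import mirrors the one shown in the diff; `values` is the experiment's
# parameterization dict and **common stands for base_url/model/request_rate/etc.
from ado_actuators.vllm_performance.vllm_performance_test.execute_benchmark import (
    execute_geospatial_benchmark,
    execute_random_benchmark,
)

GEOSPATIAL_EXPERIMENTS = {
    "performance-testing-geospatial-full",
    "performance-testing-geospatial-full-custom-dataset",
    "performance-testing-geospatial-endpoint",
}


def run_selected_benchmark(experiment_identifier: str, values: dict, **common):
    """Dispatch to the benchmark runner matching the experiment identifier."""
    if experiment_identifier in GEOSPATIAL_EXPERIMENTS:
        return execute_geospatial_benchmark(dataset=values.get("dataset"), **common)
    # The random benchmark additionally controls prompt and output lengths.
    return execute_random_benchmark(
        number_input_tokens=int(values.get("number_input_tokens")),
        max_output_tokens=int(values.get("max_output_tokens")),
        dataset=values.get("dataset"),
        **common,
    )
```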
=== Changed file 4 of 4: experiment definitions YAML (performance_testing-full and performance_testing-endpoint) ===
@@ -56,6 +56,12 @@ performance_testing-full:
variableType: 'DISCRETE_VARIABLE_TYPE'
domainRange: [ 1, 10000 ]
interval: 1
- identifier: 'dataset'
metadata:
description: "(benchmark) The dataset to be used for the experiment"
propertyDomain:
variableType: "CATEGORICAL_VARIABLE_TYPE"
values: [ 'random' ]
- identifier: image
metadata:
description: "(deployment) Docker image to use to create vllm deployments"
@@ -120,6 +126,18 @@ performance_testing-full:
propertyDomain:
variableType: "CATEGORICAL_VARIABLE_TYPE"
values: [ 'NVIDIA-A100-80GB-PCIe', 'NVIDIA-A100-SXM4-80GB' ]
- identifier: 'skip_tokenizer_init'
metadata:
description: "(deployment) skip tokenizer initialization"
propertyDomain:
variableType: BINARY_VARIABLE_TYPE
values: [True, False]
- identifier: 'enforce_eager'
metadata:
description: "(deployment) enforce PyTorch eager mode"
propertyDomain:
variableType: BINARY_VARIABLE_TYPE
values: [True, False]
defaultParameterization:
- property:
identifier: 'image'
@@ -149,6 +167,9 @@ performance_testing-full:
- property:
identifier: 'max_output_tokens'
value: 128
- property:
identifier: 'dataset'
value: 'random'
- property:
identifier: 'gpu_memory_utilization'
value: .9
@@ -167,6 +188,12 @@ performance_testing-full:
- property:
identifier: 'gpu_type'
value: 'NVIDIA-A100-80GB-PCIe'
- property:
identifier: 'skip_tokenizer_init'
value: False
- property:
identifier: 'enforce_eager'
value: False
# measurements
targetProperties:
- identifier: "duration"
@@ -221,6 +248,7 @@ performance_testing-endpoint:
description: 'The endpoint(s) to test'
propertyDomain:
variableType: "OPEN_CATEGORICAL_VARIABLE_TYPE"
values: ["http://localhost:8000"]
- identifier: 'request_rate'
metadata:
description: "The number of requests to send per second"
@@ -264,6 +292,12 @@ performance_testing-endpoint:
variableType: 'DISCRETE_VARIABLE_TYPE'
domainRange: [ -1, 500 ] # -1 means no concurrency control
interval: 1
- identifier: 'dataset'
metadata:
description: "(benchmark) The dataset to be used for the experiment"
propertyDomain:
variableType: "CATEGORICAL_VARIABLE_TYPE"
values: [ 'random' ]
defaultParameterization:
- value: 1000
property:
@@ -280,6 +314,9 @@ performance_testing-endpoint:
- value: 128
property:
identifier: 'max_output_tokens'
- property:
identifier: 'dataset'
value: 'random'
# measurements
targetProperties:
- identifier: "duration"
@@ -318,4 +355,4 @@ performance_testing-endpoint:
- identifier: "p75_e2el_ms"
- identifier: "p99_e2el_ms"
metadata:
description: 'Test inference performance of a model served by vLLM endpoint across inference workload configurations'
description: 'Test inference performance of a model served by vLLM endpoint across inference workload configurations'
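Since the catalog entries above are plain YAML, the new defaults and target properties can be inspected without running the actuator. A hedged snippet (the file path is an assumption; the `defaultParameterization` and `targetProperties` keys mirror the YAML in this diff):

```python
# Hedged sketch: print defaults and measured targets for one catalog entry.
from pathlib import Path

import yaml

catalog = yaml.safe_load(Path("experiments/experiments.yaml").read_text())  # hypothetical path
entry = catalog["performance_testing-endpoint"]

# Each defaultParameterization item pairs a property identifier with its value.
for item in entry["defaultParameterization"]:
    print(item["property"]["identifier"], "=", item["value"])

print("targets:", [t["identifier"] for t in entry["targetProperties"]])
```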