From 8d2b20142b15794594b9003b9d95c0ef76ea4427 Mon Sep 17 00:00:00 2001 From: Amy Unruh Date: Tue, 23 Mar 2021 09:06:41 -0700 Subject: [PATCH 1/4] checkpointing --- .../components/ucaip/bw_ucaip_train_pl.py | 75 +++++++ .../ucaip/model_deploy_component.yaml | 132 +++++++++++ .../ucaip/model_train_component.yaml | 206 ++++++++++++++++++ .../ucaip/model_upload_component.yaml | 108 +++++++++ .../components/ucaip/serving/deploy_model.py | 97 +++++++++ .../components/ucaip/serving/model_upload.py | 81 +++++++ .../ucaip/training/bwmodel/__init__.py | 0 .../ucaip/training/bwmodel/model.py | 111 ++++++++++ .../ucaip/training/create_training_job.py | 149 +++++++++++++ .../components/ucaip/training/setup.py | 27 +++ .../ucaip/training/trainer/__init__.py | 0 .../components/ucaip/training/trainer/task.py | 141 ++++++++++++ 12 files changed, 1127 insertions(+) create mode 100644 ml/kubeflow-pipelines/keras_tuner/components/ucaip/bw_ucaip_train_pl.py create mode 100644 ml/kubeflow-pipelines/keras_tuner/components/ucaip/model_deploy_component.yaml create mode 100644 ml/kubeflow-pipelines/keras_tuner/components/ucaip/model_train_component.yaml create mode 100644 ml/kubeflow-pipelines/keras_tuner/components/ucaip/model_upload_component.yaml create mode 100644 ml/kubeflow-pipelines/keras_tuner/components/ucaip/serving/deploy_model.py create mode 100644 ml/kubeflow-pipelines/keras_tuner/components/ucaip/serving/model_upload.py create mode 100644 ml/kubeflow-pipelines/keras_tuner/components/ucaip/training/bwmodel/__init__.py create mode 100644 ml/kubeflow-pipelines/keras_tuner/components/ucaip/training/bwmodel/model.py create mode 100644 ml/kubeflow-pipelines/keras_tuner/components/ucaip/training/create_training_job.py create mode 100644 ml/kubeflow-pipelines/keras_tuner/components/ucaip/training/setup.py create mode 100644 ml/kubeflow-pipelines/keras_tuner/components/ucaip/training/trainer/__init__.py create mode 100644 
ml/kubeflow-pipelines/keras_tuner/components/ucaip/training/trainer/task.py diff --git a/ml/kubeflow-pipelines/keras_tuner/components/ucaip/bw_ucaip_train_pl.py b/ml/kubeflow-pipelines/keras_tuner/components/ucaip/bw_ucaip_train_pl.py new file mode 100644 index 0000000..feaf33e --- /dev/null +++ b/ml/kubeflow-pipelines/keras_tuner/components/ucaip/bw_ucaip_train_pl.py @@ -0,0 +1,75 @@ +# Copyright 2021 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +from kfp.v2 import dsl +from kfp.v2 import compiler +from kfp import components + +model_train_op = components.load_component_from_file( + './model_train_component.yaml' + ) + +model_deploy_op = components.load_component_from_file( + './model_deploy_component.yaml' + ) + + +@dsl.pipeline( + name='ucaip-model-train', + description='ucaip model train' +) +def model_train_pipeline( + prediction_image_uri: str = 'us-docker.pkg.dev/cloud-aiplatform/prediction/tf2-cpu.2-3:latest', + # artifact_uri: str = 'gs://aju-pipelines/v64/077ae97e-9c6d-4c1c-b5a1-fc2e95fb7dbb/0/bwmodel/trained_model/export/bikesw/1615937808', + location: str = "us-central1", + api_endpoint: str = "us-central1-aiplatform.googleapis.com", + project: str = 'aju-vtests2', + training_display_name: str = 'CHANGE THIS', + model_display_name: str = 'CHANGE THIS', + endpoint_disp_name: str = 'CHANGE THIS', + executor_image_uri: str = 'us-docker.pkg.dev/cloud-aiplatform/training/tf-gpu.2-3:latest', + package_uri: str = 
'gs://aju-pipelines/ucaip/training1/bw-trainer-0.1.tar.gz', + python_module: str = 'trainer.task', + base_output_directory_prefix: str = 'gs://aju-pipelines/ucaip/training2/', + timeout: int = 7200, + epochs: int = 5, + steps_per_epoch: int = -1, + hptune_dict: str = '{"num_hidden_layers": 3, "hidden_size": 32, "learning_rate": 0.01}', + data_dir: str = 'gs://aju-dev-demos-codelabs/bikes_weather/' + ): + + model_train = model_train_op( + project, training_display_name, model_display_name, + executor_image_uri, package_uri, python_module, + base_output_directory_prefix, + prediction_image_uri, + location, api_endpoint, epochs, + data_dir, steps_per_epoch, hptune_dict + ) + + model_deploy = model_deploy_op( + project, endpoint_disp_name, + model_train.outputs['model_id'], + model_display_name, + location, api_endpoint, timeout + ) + + + +if __name__ == '__main__': + PIPELINE_ROOT = 'gs://aju-pipelines/pipeline_root/ucaiptests' + compiler.Compiler().compile(pipeline_func=model_train_pipeline, + pipeline_root=PIPELINE_ROOT, + output_path='ucaip_train_pipeline_spec.json') diff --git a/ml/kubeflow-pipelines/keras_tuner/components/ucaip/model_deploy_component.yaml b/ml/kubeflow-pipelines/keras_tuner/components/ucaip/model_deploy_component.yaml new file mode 100644 index 0000000..f7bb9b4 --- /dev/null +++ b/ml/kubeflow-pipelines/keras_tuner/components/ucaip/model_deploy_component.yaml @@ -0,0 +1,132 @@ +name: Deploy model +inputs: +- {name: project, type: String} +- {name: endpoint_disp_name, type: String} +- {name: model_name, type: String} +- {name: deployed_model_display_name, type: String} +- {name: location, type: String, default: us-central1, optional: true} +- {name: api_endpoint, type: String, default: us-central1-aiplatform.googleapis.com, + optional: true} +- {name: timeout, type: Integer, default: '7200', optional: true} +implementation: + container: + image: gcr.io/aju-vtests2/bw-aiplatform:v1 + command: + - sh + - -ec + - | + program_path=$(mktemp) + printf 
"%s" "$0" > "$program_path" + python3 -u "$program_path" "$@" + - | + def deploy_model( + project, + endpoint_disp_name, + model_name, + deployed_model_display_name, + location = "us-central1", + api_endpoint = "us-central1-aiplatform.googleapis.com", + timeout = 7200, + ): + + import logging + from google.cloud import aiplatform + + logging.getLogger().setLevel(logging.INFO) + + def create_endpoint( + project, + display_name, + client, + location = "us-central1", + api_endpoint = "us-central1-aiplatform.googleapis.com", + timeout = 300, + ): + + endpoint = {"display_name": display_name} + parent = f"projects/{project}/locations/{location}" + response = client.create_endpoint(parent=parent, endpoint=endpoint) + print("Long running operation:", response.operation.name) + create_endpoint_response = response.result(timeout=timeout) + print("create_endpoint_response:", create_endpoint_response) + endpoint_name = create_endpoint_response.name + logging.info('endpoint name: %s', endpoint_name) + return endpoint_name + + # The AI Platform services require regional API endpoints. + client_options = {"api_endpoint": api_endpoint} + # Initialize client that will be used to create and send requests. + # This client only needs to be created once, and can be reused for multiple requests. + client = aiplatform.gapic.EndpointServiceClient(client_options=client_options) + + # create endpoint + logging.info('creating endpoint %s', endpoint_disp_name) + endpoint_path = create_endpoint(project, endpoint_disp_name, client) + logging.info("using endpoint path ID %s", endpoint_path) + + deployed_model = { + # format: 'projects/{project}/locations/{location}/models/{model}' + "model": model_name, + "display_name": deployed_model_display_name, + # `dedicated_resources` must be used for non-AutoML models + "dedicated_resources": { + "min_replica_count": 1, + "machine_spec": { + "machine_type": "n1-standard-2", + # Accelerators can be used only if the model specifies a GPU image. 
+ # 'accelerator_type': aiplatform.gapic.AcceleratorType.NVIDIA_TESLA_K80, + # 'accelerator_count': 1, + }, + }, + } + # key '0' assigns traffic for the newly deployed model + # Traffic percentage values must add up to 100 + # Leave dictionary empty if endpoint should not accept any traffic + traffic_split = {"0": 100} + # endpoint = client.endpoint_path( + # project=project, location=location, endpoint=endpoint_id + # ) + response = client.deploy_model( + # endpoint=endpoint, deployed_model=deployed_model, traffic_split=traffic_split + endpoint=endpoint_path, deployed_model=deployed_model, traffic_split=traffic_split + ) + logging.info("Long running operation: %s", response.operation.name) + deploy_model_response = response.result(timeout=timeout) + logging.info("deploy_model_response: %s", deploy_model_response) + + import argparse + _parser = argparse.ArgumentParser(prog='Deploy model', description='') + _parser.add_argument("--project", dest="project", type=str, required=True, default=argparse.SUPPRESS) + _parser.add_argument("--endpoint-disp-name", dest="endpoint_disp_name", type=str, required=True, default=argparse.SUPPRESS) + _parser.add_argument("--model-name", dest="model_name", type=str, required=True, default=argparse.SUPPRESS) + _parser.add_argument("--deployed-model-display-name", dest="deployed_model_display_name", type=str, required=True, default=argparse.SUPPRESS) + _parser.add_argument("--location", dest="location", type=str, required=False, default=argparse.SUPPRESS) + _parser.add_argument("--api-endpoint", dest="api_endpoint", type=str, required=False, default=argparse.SUPPRESS) + _parser.add_argument("--timeout", dest="timeout", type=int, required=False, default=argparse.SUPPRESS) + _parsed_args = vars(_parser.parse_args()) + + _outputs = deploy_model(**_parsed_args) + args: + - --project + - {inputValue: project} + - --endpoint-disp-name + - {inputValue: endpoint_disp_name} + - --model-name + - {inputValue: model_name} + - 
--deployed-model-display-name + - {inputValue: deployed_model_display_name} + - if: + cond: {isPresent: location} + then: + - --location + - {inputValue: location} + - if: + cond: {isPresent: api_endpoint} + then: + - --api-endpoint + - {inputValue: api_endpoint} + - if: + cond: {isPresent: timeout} + then: + - --timeout + - {inputValue: timeout} diff --git a/ml/kubeflow-pipelines/keras_tuner/components/ucaip/model_train_component.yaml b/ml/kubeflow-pipelines/keras_tuner/components/ucaip/model_train_component.yaml new file mode 100644 index 0000000..72c8e32 --- /dev/null +++ b/ml/kubeflow-pipelines/keras_tuner/components/ucaip/model_train_component.yaml @@ -0,0 +1,206 @@ +name: Create training pipeline custom job +inputs: +- {name: project, type: String} +- {name: display_name, type: String} +- {name: model_display_name, type: String} +- {name: executor_image_uri, type: String} +- {name: package_uri, type: String} +- {name: python_module, type: String} +- {name: base_output_directory_prefix, type: String} +- {name: prediction_image_uri, type: String} +- {name: location, type: String} +- {name: api_endpoint, type: String} +- {name: epochs, type: Integer} +- {name: data_dir, type: String} +- {name: steps_per_epoch, type: Integer} +- {name: hptune_dict, type: String} +outputs: +- {name: model_id, type: String} +- {name: model_dispname, type: String} +implementation: + container: + image: gcr.io/aju-vtests2/bw-aiplatform:v1 + command: + - sh + - -ec + - | + program_path=$(mktemp) + printf "%s" "$0" > "$program_path" + python3 -u "$program_path" "$@" + - | + def _make_parent_dirs_and_return_path(file_path: str): + import os + os.makedirs(os.path.dirname(file_path), exist_ok=True) + return file_path + + def create_training_pipeline_custom_job( + project, + display_name, + model_display_name, + executor_image_uri, + package_uri, + python_module, + base_output_directory_prefix, + prediction_image_uri, # 'us-docker.pkg.dev/cloud-aiplatform/prediction/tf2-cpu.2-3:latest' + 
location, # "us-central1" + api_endpoint, # "us-central1-aiplatform.googleapis.com", + epochs, + data_dir, + steps_per_epoch, + hptune_dict, + model_id, + model_dispname + ): + + import logging + import subprocess + import time + + from google.cloud import aiplatform + from google.protobuf import json_format + from google.protobuf.struct_pb2 import Value + from google.cloud.aiplatform_v1beta1.types import pipeline_state + + logging.getLogger().setLevel(logging.INFO) + + # The AI Platform services require regional API endpoints. + client_options = {"api_endpoint": api_endpoint} + # Initialize client that will be used to create and send requests. + # This client only needs to be created once, and can be reused for multiple requests. + client = aiplatform.gapic.PipelineServiceClient(client_options=client_options) + + training_task_inputs_dict = { + "workerPoolSpecs": [ + { + # "machine_spec": {"machineType": "n1-standard-16"}, + "machine_spec": { + "machine_type": "n1-standard-16", + "accelerator_type": aiplatform.gapic.AcceleratorType.NVIDIA_TESLA_K80, + "accelerator_count": 2, + }, + "replica_count": 1, + "python_package_spec": { + "executor_image_uri": executor_image_uri, + "package_uris": [package_uri], + "python_module": python_module, + "args": [f"--epochs={epochs}", f"--data-dir={data_dir}", + f"--steps-per-epoch={steps_per_epoch}", f"--hptune-dict={hptune_dict}"], + }, + } + # { + # "replicaCount": 1, + # "machineSpec": {"machineType": "n1-standard-4"}, + # "containerSpec": { + # # A working docker image can be found at gs://cloud-samples-data/ai-platform/mnist_tfrecord/custom_job + # "imageUri": container_image_uri, + # "args": [ + # # AIP_MODEL_DIR is set by the service according to baseOutputDirectory. + # "--model_dir=$(AIP_MODEL_DIR)", + # ], + # }, + # } + ], + "baseOutputDirectory": { + # The GCS location for outputs must be accessible by the project's AI Platform service account. 
+ "output_uri_prefix": base_output_directory_prefix + }, + } + training_task_inputs = json_format.ParseDict(training_task_inputs_dict, Value()) + + training_task_definition = "gs://google-cloud-aiplatform/schema/trainingjob/definition/custom_task_1.0.0.yaml" + # image_uri = "gcr.io/cloud-aiplatform/prediction/tf-cpu.1-15:latest" + # image_uri = 'us-docker.pkg.dev/cloud-aiplatform/prediction/tf2-cpu.2-3:latest' + + training_pipeline = { + "display_name": display_name, + "training_task_definition": training_task_definition, + "training_task_inputs": training_task_inputs, + "model_to_upload": { + "display_name": model_display_name, + "container_spec": {"image_uri": prediction_image_uri}, + }, + } + parent = f"projects/{project}/locations/{location}" + response = client.create_training_pipeline( + parent=parent, training_pipeline=training_pipeline + ) + logging.info("training pipeline request response: %s", response) + + SLEEP_INTERVAL = 100 + + training_pipeline_name = response.name + logging.info("training pipeline name: %s", training_pipeline_name) + # Poll periodically until training completes + while True: + mresponse = client.get_training_pipeline(name=training_pipeline_name) + logging.info('mresponse: %s', mresponse) + logging.info('job state: %s', mresponse.state) + if mresponse.state == pipeline_state.PipelineState.PIPELINE_STATE_SUCCEEDED: + logging.info('training finished') + # write some outputs once finished + model_name = mresponse.model_to_upload.name + logging.info('got model name: %s', model_name) + with open('temp.txt', "w") as outfile: + outfile.write(model_name) + subprocess.run(['gsutil', 'cp', 'temp.txt', model_id]) + with open('temp2.txt', "w") as outfile: + outfile.write(model_display_name) + subprocess.run(['gsutil', 'cp', 'temp2.txt', model_dispname]) + break + else: + time.sleep(SLEEP_INTERVAL) + + import argparse + _parser = argparse.ArgumentParser(prog='Create training pipeline custom job', description='') + 
_parser.add_argument("--project", dest="project", type=str, required=True, default=argparse.SUPPRESS) + _parser.add_argument("--display-name", dest="display_name", type=str, required=True, default=argparse.SUPPRESS) + _parser.add_argument("--model-display-name", dest="model_display_name", type=str, required=True, default=argparse.SUPPRESS) + _parser.add_argument("--executor-image-uri", dest="executor_image_uri", type=str, required=True, default=argparse.SUPPRESS) + _parser.add_argument("--package-uri", dest="package_uri", type=str, required=True, default=argparse.SUPPRESS) + _parser.add_argument("--python-module", dest="python_module", type=str, required=True, default=argparse.SUPPRESS) + _parser.add_argument("--base-output-directory-prefix", dest="base_output_directory_prefix", type=str, required=True, default=argparse.SUPPRESS) + _parser.add_argument("--prediction-image-uri", dest="prediction_image_uri", type=str, required=True, default=argparse.SUPPRESS) + _parser.add_argument("--location", dest="location", type=str, required=True, default=argparse.SUPPRESS) + _parser.add_argument("--api-endpoint", dest="api_endpoint", type=str, required=True, default=argparse.SUPPRESS) + _parser.add_argument("--epochs", dest="epochs", type=int, required=True, default=argparse.SUPPRESS) + _parser.add_argument("--data-dir", dest="data_dir", type=str, required=True, default=argparse.SUPPRESS) + _parser.add_argument("--steps-per-epoch", dest="steps_per_epoch", type=int, required=True, default=argparse.SUPPRESS) + _parser.add_argument("--hptune-dict", dest="hptune_dict", type=str, required=True, default=argparse.SUPPRESS) + _parser.add_argument("--model-id", dest="model_id", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS) + _parser.add_argument("--model-dispname", dest="model_dispname", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS) + _parsed_args = vars(_parser.parse_args()) + + _outputs = 
create_training_pipeline_custom_job(**_parsed_args) + args: + - --project + - {inputValue: project} + - --display-name + - {inputValue: display_name} + - --model-display-name + - {inputValue: model_display_name} + - --executor-image-uri + - {inputValue: executor_image_uri} + - --package-uri + - {inputValue: package_uri} + - --python-module + - {inputValue: python_module} + - --base-output-directory-prefix + - {inputValue: base_output_directory_prefix} + - --prediction-image-uri + - {inputValue: prediction_image_uri} + - --location + - {inputValue: location} + - --api-endpoint + - {inputValue: api_endpoint} + - --epochs + - {inputValue: epochs} + - --data-dir + - {inputValue: data_dir} + - --steps-per-epoch + - {inputValue: steps_per_epoch} + - --hptune-dict + - {inputValue: hptune_dict} + - --model-id + - {outputPath: model_id} + - --model-dispname + - {outputPath: model_dispname} diff --git a/ml/kubeflow-pipelines/keras_tuner/components/ucaip/model_upload_component.yaml b/ml/kubeflow-pipelines/keras_tuner/components/ucaip/model_upload_component.yaml new file mode 100644 index 0000000..e98f80b --- /dev/null +++ b/ml/kubeflow-pipelines/keras_tuner/components/ucaip/model_upload_component.yaml @@ -0,0 +1,108 @@ +name: Upload model +inputs: +- {name: project, type: String} +- {name: display_name, type: String} +- {name: image_uri, type: String} +- {name: artifact_uri, type: String} +- {name: location, type: String} +- {name: api_endpoint, type: String} +- {name: timeout, type: Integer} +outputs: +- {name: model_id, type: String} +implementation: + container: + image: gcr.io/aju-vtests2/bw-aiplatform:v1 + command: + - sh + - -ec + - | + program_path=$(mktemp) + printf "%s" "$0" > "$program_path" + python3 -u "$program_path" "$@" + - | + def _make_parent_dirs_and_return_path(file_path: str): + import os + os.makedirs(os.path.dirname(file_path), exist_ok=True) + return file_path + + def upload_model( + project, + display_name, + image_uri, + artifact_uri, + location, # 
"us-central1", + api_endpoint, #"us-central1-aiplatform.googleapis.com", + timeout, # 1800, + model_id + ): + import logging + import subprocess + from google.cloud import aiplatform + + logging.getLogger().setLevel(logging.INFO) + metadata_schema_uri = "" + + # The AI Platform services require regional API endpoints. + client_options = {"api_endpoint": api_endpoint} + # Initialize client that will be used to create and send requests. + # This client only needs to be created once, and can be reused for multiple requests. + client = aiplatform.gapic.ModelServiceClient(client_options=client_options) + model = { + "display_name": display_name, + "metadata_schema_uri": metadata_schema_uri, + # The artifact_uri should be the path to a GCS directory containing + # saved model artifacts. The bucket must be accessible for the + # project's AI Platform service account and in the same region as + # the api endpoint. + "artifact_uri": artifact_uri, + "container_spec": { + "image_uri": image_uri, + "command": [], + "args": [], + "env": [], + "ports": [], + "predict_route": "", + "health_route": "", + }, + } + parent = f"projects/{project}/locations/{location}" + response = client.upload_model(parent=parent, model=model) + logging.info("Long running operation: %s", response.operation.name) + upload_model_response = response.result(timeout=timeout) + logging.info("upload_model_response: %s", upload_model_response) + model_path = upload_model_response.model + logging.info('got model path: %s', model_path) + with open('temp.txt', "w") as outfile: + outfile.write(model_path) + subprocess.run(['gsutil', 'cp', 'temp.txt', model_id]) + + import argparse + _parser = argparse.ArgumentParser(prog='Upload model', description='') + _parser.add_argument("--project", dest="project", type=str, required=True, default=argparse.SUPPRESS) + _parser.add_argument("--display-name", dest="display_name", type=str, required=True, default=argparse.SUPPRESS) + _parser.add_argument("--image-uri", 
dest="image_uri", type=str, required=True, default=argparse.SUPPRESS) + _parser.add_argument("--artifact-uri", dest="artifact_uri", type=str, required=True, default=argparse.SUPPRESS) + _parser.add_argument("--location", dest="location", type=str, required=True, default=argparse.SUPPRESS) + _parser.add_argument("--api-endpoint", dest="api_endpoint", type=str, required=True, default=argparse.SUPPRESS) + _parser.add_argument("--timeout", dest="timeout", type=int, required=True, default=argparse.SUPPRESS) + _parser.add_argument("--model-id", dest="model_id", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS) + _parsed_args = vars(_parser.parse_args()) + + _outputs = upload_model(**_parsed_args) + args: + - --project + - {inputValue: project} + - --display-name + - {inputValue: display_name} + - --image-uri + - {inputValue: image_uri} + - --artifact-uri + - {inputValue: artifact_uri} + - --location + - {inputValue: location} + - --api-endpoint + - {inputValue: api_endpoint} + - --timeout + - {inputValue: timeout} + - --model-id + - {outputPath: model_id} diff --git a/ml/kubeflow-pipelines/keras_tuner/components/ucaip/serving/deploy_model.py b/ml/kubeflow-pipelines/keras_tuner/components/ucaip/serving/deploy_model.py new file mode 100644 index 0000000..405531a --- /dev/null +++ b/ml/kubeflow-pipelines/keras_tuner/components/ucaip/serving/deploy_model.py @@ -0,0 +1,97 @@ +# Copyright 2021 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and
# limitations under the License.


def deploy_model(
    project: str,
    endpoint_disp_name: str,
    model_name: str,
    deployed_model_display_name: str,
    location: str = "us-central1",
    api_endpoint: str = "us-central1-aiplatform.googleapis.com",
    timeout: int = 7200,
):
    """Create a new endpoint and deploy an already-uploaded model to it.

    The function is deliberately self-contained (its imports and its helper
    live inside the body): the ``__main__`` block below passes it to
    ``kfp.components.func_to_container_op`` to generate a component file.

    Args:
        project: Project id used to build the
            ``projects/{project}/locations/{location}`` parent path.
        endpoint_disp_name: Display name given to the endpoint that is created.
        model_name: Resource name of the model to deploy, in the form
            ``projects/{project}/locations/{location}/models/{model}``.
        deployed_model_display_name: Display name of the deployed model.
        location: Region in which the endpoint is created.
        api_endpoint: Regional API endpoint the service client connects to.
        timeout: Seconds to wait for the deploy operation to complete.
    """
    import logging
    from google.cloud import aiplatform

    logging.getLogger().setLevel(logging.INFO)

    def create_endpoint(
        project: str,
        display_name: str,
        client,
        location: str = "us-central1",
        api_endpoint: str = "us-central1-aiplatform.googleapis.com",
        timeout: int = 300,
    ):
        """Create an endpoint, wait for the operation, return its resource name."""
        endpoint = {"display_name": display_name}
        parent = f"projects/{project}/locations/{location}"
        response = client.create_endpoint(parent=parent, endpoint=endpoint)
        print("Long running operation:", response.operation.name)
        create_endpoint_response = response.result(timeout=timeout)
        print("create_endpoint_response:", create_endpoint_response)
        endpoint_name = create_endpoint_response.name
        logging.info('endpoint name: %s', endpoint_name)
        return endpoint_name

    # The AI Platform services require regional API endpoints.
    client_options = {"api_endpoint": api_endpoint}
    # The client is created once and reused for both requests below.
    client = aiplatform.gapic.EndpointServiceClient(client_options=client_options)

    # Create the endpoint in the caller's region.  Previously `location` was
    # not forwarded, so the endpoint was always created under "us-central1"
    # (the helper's default) no matter what the caller asked for.
    logging.info('creating endpoint %s', endpoint_disp_name)
    endpoint_path = create_endpoint(
        project,
        endpoint_disp_name,
        client,
        location=location,
        api_endpoint=api_endpoint,
    )
    logging.info("using endpoint path ID %s", endpoint_path)

    deployed_model = {
        # format: 'projects/{project}/locations/{location}/models/{model}'
        "model": model_name,
        "display_name": deployed_model_display_name,
        # `dedicated_resources` must be used for non-AutoML models
        "dedicated_resources": {
            "min_replica_count": 1,
            "machine_spec": {
                "machine_type": "n1-standard-2",
                # Accelerators can be used only if the model specifies a GPU image.
                # 'accelerator_type': aiplatform.gapic.AcceleratorType.NVIDIA_TESLA_K80,
                # 'accelerator_count': 1,
            },
        },
    }
    # key '0' assigns traffic for the newly deployed model
    # Traffic percentage values must add up to 100
    # Leave dictionary empty if endpoint should not accept any traffic
    traffic_split = {"0": 100}
    response = client.deploy_model(
        endpoint=endpoint_path,
        deployed_model=deployed_model,
        traffic_split=traffic_split,
    )
    logging.info("Long running operation: %s", response.operation.name)
    deploy_model_response = response.result(timeout=timeout)
    logging.info("deploy_model_response: %s", deploy_model_response)


if __name__ == '__main__':
    # Example of calling the function directly:
    # deploy_model('aju-vtests2', 'endpoint_test2',
    #     'projects/467744782358/locations/us-central1/models/6181278449496227840', 'sdk_test1')
    #
    # Running this script (re)generates the component definition from the
    # function above; re-run it whenever deploy_model changes.
    import kfp
    kfp.components.func_to_container_op(
        deploy_model,
        output_component_file='../model_deploy_component.yaml',
        base_image='gcr.io/aju-vtests2/bw-aiplatform:v1')
# File: ml/kubeflow-pipelines/keras_tuner/components/ucaip/serving/model_upload.py
#
# Copyright 2021 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from kfp.components import OutputPath


def upload_model(
    project: str,
    display_name: str,
    image_uri: str,
    artifact_uri: str,
    location: str,  # "us-central1",
    api_endpoint: str,  # "us-central1-aiplatform.googleapis.com",
    timeout: int,  # 1800,
    model_id: OutputPath('String'),
):
    """Upload saved-model artifacts as a model and record the result.

    The function is self-contained (imports inside the body) because the
    ``__main__`` block below passes it to
    ``kfp.components.func_to_container_op`` to generate a component file.

    Args:
        project: Project id used to build the
            ``projects/{project}/locations/{location}`` parent path.
        display_name: Display name of the uploaded model.
        image_uri: Serving container image for the model.
        artifact_uri: GCS directory containing the saved model artifacts.
        location: Region the model is uploaded to.
        api_endpoint: Regional API endpoint the service client connects to.
        timeout: Seconds to wait for the upload operation to complete.
        model_id: Output path; the ``model`` field of the upload response is
            written to a temporary file and copied there with ``gsutil cp``.

    Raises:
        subprocess.CalledProcessError: If the ``gsutil cp`` that writes the
            ``model_id`` output exits with a non-zero status.
    """
    import logging
    import subprocess
    from google.cloud import aiplatform

    logging.getLogger().setLevel(logging.INFO)
    metadata_schema_uri = ""

    # The AI Platform services require regional API endpoints.
    client_options = {"api_endpoint": api_endpoint}
    # The client only needs to be created once and can be reused.
    client = aiplatform.gapic.ModelServiceClient(client_options=client_options)
    model = {
        "display_name": display_name,
        "metadata_schema_uri": metadata_schema_uri,
        # The artifact_uri should be the path to a GCS directory containing
        # saved model artifacts.  The bucket must be accessible for the
        # project's AI Platform service account and in the same region as
        # the api endpoint.
        "artifact_uri": artifact_uri,
        "container_spec": {
            "image_uri": image_uri,
            "command": [],
            "args": [],
            "env": [],
            "ports": [],
            "predict_route": "",
            "health_route": "",
        },
    }
    parent = f"projects/{project}/locations/{location}"
    response = client.upload_model(parent=parent, model=model)
    logging.info("Long running operation: %s", response.operation.name)
    upload_model_response = response.result(timeout=timeout)
    logging.info("upload_model_response: %s", upload_model_response)
    model_path = upload_model_response.model
    logging.info('got model path: %s', model_path)
    with open('temp.txt', "w") as outfile:
        outfile.write(model_path)
    # check=True: a failed copy must fail the step.  Without it the output
    # would silently be missing while the step still reported success.
    subprocess.run(['gsutil', 'cp', 'temp.txt', model_id], check=True)


if __name__ == '__main__':
    # Running this script (re)generates the component definition from the
    # function above; re-run it whenever upload_model changes.
    import kfp
    kfp.components.func_to_container_op(
        upload_model,
        output_component_file='../model_upload_component.yaml',
        base_image='gcr.io/aju-vtests2/bw-aiplatform:v1')


# Equivalent gcloud command, for reference:
# gcloud beta ai models upload --region=us-central1 --display-name=bw2 --container-image-uri=us-docker.pkg.dev/cloud-aiplatform/prediction/tf2-cpu.2-3:latest --artifact-uri=gs://aju-pipelines/ktune13/f8515c75-32b7-47a4-af70-5ff24362eccc/0/bwmodel/trained_model/export/bikesw/1603733739

# (The patch also adds an empty file here:
#  ml/kubeflow-pipelines/keras_tuner/components/ucaip/training/bwmodel/__init__.py)
# File: ml/kubeflow-pipelines/keras_tuner/components/ucaip/training/bwmodel/model.py
# Copyright 2020 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Adapted in part from:
# https://github.com/GoogleCloudPlatform/data-science-on-gcp/blob/master/09_cloudml/flights_model_tf2.ipynb
# by Valliappa Lakshmanan. (See that repo for more info about the accompanying book,
# "Data Science on the Google Cloud Platform", from O'Reilly.)

"""Input pipeline and wide-and-deep Keras model for the bikes & weather data."""

import tensorflow as tf


# Column order of the CSV files; DEFAULTS below is positionally aligned with it
# (a float default marks a numeric column, 'na' a string column).
CSV_COLUMNS = ('duration,end_station_id,bike_id,ts,day_of_week,start_station_id'
               + ',start_latitude,start_longitude,end_latitude,end_longitude'
               + ',euclidean,loc_cross,prcp,max,min,temp,dewp').split(',')
LABEL_COLUMN = 'duration'
DEFAULTS = [[0.0], ['na'], ['na'], [0.0], ['na'], ['na'],
            [0.0], [0.0], [0.0], [0.0],
            [0.0], ['na'], [0.0], [0.0], [0.0], [0.0], [0.0]]


def load_dataset(pattern, batch_size=1):
    """Return a batched CSV dataset for the files matching `pattern`.

    Columns are named by CSV_COLUMNS and typed/defaulted by DEFAULTS.
    """
    return tf.data.experimental.make_csv_dataset(pattern, batch_size, CSV_COLUMNS, DEFAULTS)


def features_and_labels(features):
    """Split a feature dict into `(features, label)`.

    The label column is removed from the dict and returned separately;
    'bike_id' is dropped because the model does not use it. The dict passed
    in is modified in place.
    """
    label = features.pop(LABEL_COLUMN)  # this is what we will train for
    features.pop('bike_id')
    return features, label


def read_dataset(pattern, batch_size, mode=tf.estimator.ModeKeys.TRAIN, truncate=None):
    """Build the `(features, label)` dataset used for training or evaluation.

    Args:
      pattern: file pattern of the CSV files to read.
      batch_size: number of rows per batch.
      mode: in TRAIN mode the dataset is repeated and shuffled with a buffer
        of `batch_size * 10` elements; any other mode skips both steps.
      truncate: if not None, keep only the first `truncate` elements of the
        resulting dataset.

    Returns:
      A prefetched `tf.data` dataset of `(features, label)` pairs.
    """
    dataset = load_dataset(pattern, batch_size)
    dataset = dataset.map(features_and_labels,
                          num_parallel_calls=tf.data.experimental.AUTOTUNE)
    if mode == tf.estimator.ModeKeys.TRAIN:
        dataset = dataset.repeat().shuffle(batch_size * 10)
    dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
    if truncate is not None:
        dataset = dataset.take(truncate)
    return dataset


def get_layers():
    """Create the Keras inputs and the feature columns for the model.

    Returns:
      A tuple `(inputs, sparse, real)`:
        inputs: dict of `tf.keras.layers.Input` keyed by column name
          (float32 for the numeric columns, float64 for 'ts', string for the
          categorical columns).
        sparse: dict of indicator (one-hot) columns built from the
          categorical columns; used for the wide part of the model.
        real: dict of the numeric columns plus a 10-dimensional embedding
          column per categorical column; used for the deep part.
    """
    # CSV schema for reference:
    # duration,end_station_id,bike_id,ts,day_of_week,start_station_id,start_latitude,
    # start_longitude,end_latitude,end_longitude,euclidean,loc_cross,prcp,max,min,temp,dewp
    real = {
        colname: tf.feature_column.numeric_column(colname)
        for colname in 'euclidean,prcp,max,min,temp,dewp'.split(',')
    }
    sparse = {
        'day_of_week': tf.feature_column.categorical_column_with_vocabulary_list(
            'day_of_week', vocabulary_list='1,2,3,4,5,6,7'.split(',')),
        'end_station_id': tf.feature_column.categorical_column_with_hash_bucket(
            'end_station_id', hash_bucket_size=800),
        'start_station_id': tf.feature_column.categorical_column_with_hash_bucket(
            'start_station_id', hash_bucket_size=800),
        'loc_cross': tf.feature_column.categorical_column_with_hash_bucket(
            'loc_cross', hash_bucket_size=21000),
    }

    inputs = {
        colname: tf.keras.layers.Input(name=colname, shape=(), dtype='float32')
        for colname in real.keys()
    }
    # 'ts' gets an input layer although no feature column is built from it.
    inputs.update({'ts': tf.keras.layers.Input(name='ts', shape=(), dtype='float64')})
    inputs.update({
        colname: tf.keras.layers.Input(name=colname, shape=(), dtype='string')
        for colname in sparse.keys()
    })

    # embed all the sparse columns
    embed = {
        'embed_{}'.format(colname): tf.feature_column.embedding_column(col, 10)
        for colname, col in sparse.items()
    }
    real.update(embed)

    # one-hot encode the sparse columns
    sparse = {
        colname: tf.feature_column.indicator_column(col)
        for colname, col in sparse.items()
    }
    return inputs, sparse, real


# Build a wide-and-deep model.
def wide_and_deep_classifier(inputs, linear_feature_columns, dnn_feature_columns,
                             num_hidden_layers, dnn_hidden_units1, learning_rate):
    """Build and compile the wide-and-deep model.

    Despite the name, the model has a single linear output unit and is
    compiled with an MSE loss, i.e. it is a regression model.

    Args:
      inputs: dict of Keras input layers (see `get_layers`).
      linear_feature_columns: feature columns for the wide part.
      dnn_feature_columns: feature columns for the deep part.
      num_hidden_layers: number of dense layers in the deep part.
      dnn_hidden_units1: units in the first dense layer; layer k (k >= 1,
        zero-based) gets `dnn_hidden_units1 / (2 * k)` units.
      learning_rate: learning rate for the RMSprop optimizer.

    Returns:
      The compiled `tf.keras.Model`.
    """
    deep = tf.keras.layers.DenseFeatures(dnn_feature_columns, name='deep_inputs')(inputs)
    layer_sizes = [dnn_hidden_units1]
    if num_hidden_layers > 1:
        # Clamp to 1 so that a small first layer combined with many layers
        # never yields a zero-unit Dense layer.
        layer_sizes += [max(1, int(dnn_hidden_units1 / (x * 2)))
                        for x in range(1, num_hidden_layers)]
    for layerno, numnodes in enumerate(layer_sizes):
        deep = tf.keras.layers.Dense(
            numnodes, activation='relu', name='dnn_{}'.format(layerno + 1))(deep)
    wide = tf.keras.layers.DenseFeatures(linear_feature_columns, name='wide_inputs')(inputs)
    both = tf.keras.layers.concatenate([deep, wide], name='both')
    output = tf.keras.layers.Dense(1, name='dur')(both)
    model = tf.keras.Model(inputs, output)
    optimizer = tf.keras.optimizers.RMSprop(learning_rate)
    model.compile(loss='mse', optimizer=optimizer,
                  metrics=['mse', 'mae', tf.keras.metrics.RootMeanSquaredError()])
    return model


# ---------------------------------------------------------------------------
# Patch-series header that shared the original line (kept as comments):
# diff --git a/ml/kubeflow-pipelines/keras_tuner/components/ucaip/training/create_training_job.py b/ml/kubeflow-pipelines/keras_tuner/components/ucaip/training/create_training_job.py
# new file mode 100644
# index 0000000..b5c64ff
# --- /dev/null
# +++ b/ml/kubeflow-pipelines/keras_tuner/components/ucaip/training/create_training_job.py
# @@ -0,0 +1,149 @@
# ---------------------------------------------------------------------------
# Copyright 2021 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# File: ml/kubeflow-pipelines/keras_tuner/components/ucaip/training/create_training_job.py

from kfp.components import OutputPath


def create_training_pipeline_custom_job(
    project: str,
    display_name: str,
    model_display_name: str,
    executor_image_uri: str,
    package_uri: str,
    python_module: str,
    base_output_directory_prefix: str,
    prediction_image_uri: str,  # 'us-docker.pkg.dev/cloud-aiplatform/prediction/tf2-cpu.2-3:latest'
    location: str,  # "us-central1"
    api_endpoint: str,  # "us-central1-aiplatform.googleapis.com",
    epochs: int,
    data_dir: str,
    steps_per_epoch: int,
    hptune_dict: str,
    model_id: OutputPath('String'),
    model_dispname: OutputPath('String')
):
    """Launch a custom-job training pipeline and wait for it to finish.

    Creates a training pipeline that runs `python_module` from `package_uri`
    on the `executor_image_uri` image, then polls the pipeline every 100
    seconds. On success the uploaded model's name is written to `model_id`
    and `model_display_name` to `model_dispname` (each via a local temp file
    copied with `gsutil cp`).

    Args:
      project: project id used to build the pipeline's parent resource.
      display_name: display name of the training pipeline.
      model_display_name: display name for the model the pipeline uploads.
      executor_image_uri: image that runs the Python training package.
      package_uri: URI of the Python training package.
      python_module: module within the package to run.
      base_output_directory_prefix: output URI prefix for the job.
      prediction_image_uri: serving image recorded on the uploaded model.
      location: region used to build the pipeline's parent resource.
      api_endpoint: regional API endpoint for the client.
      epochs: forwarded to the trainer as `--epochs`.
      data_dir: forwarded to the trainer as `--data-dir`.
      steps_per_epoch: forwarded to the trainer as `--steps-per-epoch`.
      hptune_dict: forwarded to the trainer as `--hptune-dict`.
      model_id: output path that receives the uploaded model's name.
      model_dispname: output path that receives the model display name.

    Raises:
      RuntimeError: if the pipeline ends in the FAILED or CANCELLED state.
      subprocess.CalledProcessError: if a `gsutil cp` of an output fails.
    """
    # Imports and helpers stay inside the function: the `__main__` block below
    # turns this single function into a container component.
    import logging
    import subprocess
    import time

    from google.cloud import aiplatform
    from google.protobuf import json_format
    from google.protobuf.struct_pb2 import Value
    from google.cloud.aiplatform_v1beta1.types import pipeline_state

    def _write_output(text, local_name, destination):
        # Write `text` to a local file and copy it to the output location;
        # check=True surfaces a failed copy instead of silently losing the output.
        with open(local_name, "w") as outfile:
            outfile.write(text)
        subprocess.run(['gsutil', 'cp', local_name, destination], check=True)

    logging.getLogger().setLevel(logging.INFO)

    # The AI Platform services require regional API endpoints.
    client_options = {"api_endpoint": api_endpoint}
    # Initialize client that will be used to create and send requests.
    # This client only needs to be created once, and can be reused for multiple requests.
    client = aiplatform.gapic.PipelineServiceClient(client_options=client_options)

    training_task_inputs_dict = {
        "workerPoolSpecs": [
            {
                "machine_spec": {
                    "machine_type": "n1-standard-16",
                    "accelerator_type": aiplatform.gapic.AcceleratorType.NVIDIA_TESLA_K80,
                    "accelerator_count": 2,
                },
                "replica_count": 1,
                "python_package_spec": {
                    "executor_image_uri": executor_image_uri,
                    "package_uris": [package_uri],
                    "python_module": python_module,
                    "args": [f"--epochs={epochs}", f"--data-dir={data_dir}",
                             f"--steps-per-epoch={steps_per_epoch}",
                             f"--hptune-dict={hptune_dict}"],
                },
            }
        ],
        "baseOutputDirectory": {
            # The GCS location for outputs must be accessible by the project's AI Platform service account.
            "output_uri_prefix": base_output_directory_prefix
        },
    }
    training_task_inputs = json_format.ParseDict(training_task_inputs_dict, Value())

    training_task_definition = "gs://google-cloud-aiplatform/schema/trainingjob/definition/custom_task_1.0.0.yaml"

    training_pipeline = {
        "display_name": display_name,
        "training_task_definition": training_task_definition,
        "training_task_inputs": training_task_inputs,
        "model_to_upload": {
            "display_name": model_display_name,
            "container_spec": {"image_uri": prediction_image_uri},
        },
    }
    parent = f"projects/{project}/locations/{location}"
    response = client.create_training_pipeline(
        parent=parent, training_pipeline=training_pipeline
    )
    logging.info("training pipeline request response: %s", response)

    SLEEP_INTERVAL = 100
    # States after which the pipeline will never succeed; without this check
    # the loop below would poll forever.
    failure_states = (
        pipeline_state.PipelineState.PIPELINE_STATE_FAILED,
        pipeline_state.PipelineState.PIPELINE_STATE_CANCELLED,
    )

    training_pipeline_name = response.name
    logging.info("training pipeline name: %s", training_pipeline_name)
    # Poll periodically until training completes
    while True:
        mresponse = client.get_training_pipeline(name=training_pipeline_name)
        logging.info('mresponse: %s', mresponse)
        logging.info('job state: %s', mresponse.state)
        if mresponse.state == pipeline_state.PipelineState.PIPELINE_STATE_SUCCEEDED:
            logging.info('training finished')
            # write some outputs once finished
            model_name = mresponse.model_to_upload.name
            logging.info('got model name: %s', model_name)
            _write_output(model_name, 'temp.txt', model_id)
            _write_output(model_display_name, 'temp2.txt', model_dispname)
            break
        if mresponse.state in failure_states:
            raise RuntimeError(
                'training pipeline {} ended in state {}'.format(
                    training_pipeline_name, mresponse.state))
        time.sleep(SLEEP_INTERVAL)


if __name__ == '__main__':
    # create_training_pipeline_custom_job(
    #     'aju-vtests2', 'bw_sdktest2',
    #     'bw_sdktest2',
    #     'us-docker.pkg.dev/cloud-aiplatform/training/tf-gpu.2-3:latest',
    #     'gs://aju-pipelines/ucaip/training1/bw-trainer-0.1.tar.gz',
    #     'trainer.task',
    #     'gs://aju-pipelines/ucaip/test1803_sdk1',
    #     "us-central1",
    #     "us-central1-aiplatform.googleapis.com",
    #     )
    import kfp
    kfp.components.func_to_container_op(
        create_training_pipeline_custom_job,
        output_component_file='../model_train_component.yaml',
        base_image='gcr.io/aju-vtests2/bw-aiplatform:v1')


# ---------------------------------------------------------------------------
# Patch-series header that shared the original line (kept as comments):
# diff --git a/ml/kubeflow-pipelines/keras_tuner/components/ucaip/training/setup.py b/ml/kubeflow-pipelines/keras_tuner/components/ucaip/training/setup.py
# new file mode 100644
# index 0000000..494c34b
# --- /dev/null
# +++ b/ml/kubeflow-pipelines/keras_tuner/components/ucaip/training/setup.py
# @@ -0,0 +1,27 @@
# ---------------------------------------------------------------------------
# Copyright 2021 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from setuptools import find_packages
from setuptools import setup

REQUIRED_PACKAGES = []

# Packaging metadata for the trainer distribution ('bw-trainer', version 0.1).
setup(
    name='bw-trainer',
    version='0.1',
    install_requires=REQUIRED_PACKAGES,
    packages=find_packages(),
    include_package_data=True,
    description='bikes & weather training application.'
)

# ---------------------------------------------------------------------------
# Patch-series header that shared the original line (kept as comments):
# diff --git a/ml/kubeflow-pipelines/keras_tuner/components/ucaip/training/trainer/__init__.py b/ml/kubeflow-pipelines/keras_tuner/components/ucaip/training/trainer/__init__.py
# new file mode 100644
# index 0000000..e69de29
# diff --git a/ml/kubeflow-pipelines/keras_tuner/components/ucaip/training/trainer/task.py b/ml/kubeflow-pipelines/keras_tuner/components/ucaip/training/trainer/task.py
# new file mode 100644
# index 0000000..88de6a5
# --- /dev/null
# +++ b/ml/kubeflow-pipelines/keras_tuner/components/ucaip/training/trainer/task.py
# @@ -0,0 +1,141 @@
# ---------------------------------------------------------------------------
# Copyright 2021 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# File: ml/kubeflow-pipelines/keras_tuner/components/ucaip/training/trainer/task.py

import argparse
import logging
import json
import os
import time

import tensorflow as tf

import bwmodel.model as bwmodel

DEVELOP_MODE = False
NBUCKETS = 5  # for embeddings
NUM_EXAMPLES = 1000*1000 * 20  # assume 20 million examples

STRATEGY = tf.distribute.MirroredStrategy()
TRAIN_BATCH_SIZE = 64 * STRATEGY.num_replicas_in_sync


def create_model(learning_rate, hidden_size, num_hidden_layers):
    """Build the wide-and-deep model inside the distribution strategy scope.

    Args:
      learning_rate: optimizer learning rate.
      hidden_size: number of units in the first hidden layer.
      num_hidden_layers: number of hidden layers in the deep part.

    Returns:
      The compiled Keras model returned by `bwmodel.wide_and_deep_classifier`.
    """
    inputs, sparse, real = bwmodel.get_layers()

    logging.info('sparse keys: %s', sparse.keys())
    logging.info('real keys: %s', real.keys())

    print('num replicas...')
    print(STRATEGY.num_replicas_in_sync)

    # The model is created under the strategy scope so that it is built for
    # the mirrored strategy.
    with STRATEGY.scope():
        model = bwmodel.wide_and_deep_classifier(
            inputs,
            linear_feature_columns=sparse.values(),
            dnn_feature_columns=real.values(),
            num_hidden_layers=num_hidden_layers,
            dnn_hidden_units1=hidden_size,
            learning_rate=learning_rate)

    model.summary()
    return model


def run_training(
    epochs: int, data_dir: str,
    steps_per_epoch: int, hptune_dict: str
    ):
    """Train the model and export it to the `AIP_MODEL_DIR` location.

    Args:
      epochs: number of training epochs.
      data_dir: prefix of the data files; "train*" and "test*" are appended
        directly, so it is expected to end with a path separator.
      steps_per_epoch: steps per epoch, or -1 to derive it from NUM_EXAMPLES
        and the training batch size.
      hptune_dict: JSON string with the keys 'learning_rate', 'hidden_size'
        and 'num_hidden_layers'.

    Raises:
      KeyError: if the `AIP_MODEL_DIR` environment variable is not set.
    """
    if 'AIP_MODEL_DIR' not in os.environ:
        raise KeyError(
            'The `AIP_MODEL_DIR` environment variable has not been '
            'set. See https://cloud.google.com/ai-platform-unified/docs/tutorials/image-recognition-custom/training'
        )

    logging.getLogger().setLevel(logging.INFO)

    logging.info('epochs: %s', epochs)
    logging.info('Tensorflow version %s', tf.__version__)

    # Parse the function argument, not the module-level `args` namespace,
    # which only exists when this file runs as a script.
    hptune_info = json.loads(str(hptune_dict))
    logging.info('hptune_info: %s', hptune_info)
    learning_rate = hptune_info['learning_rate']
    hidden_size = hptune_info['hidden_size']
    num_hidden_layers = hptune_info['num_hidden_layers']
    logging.info('using: learning rate %s, hidden size %s, num hidden layers %s',
                 learning_rate, hidden_size, num_hidden_layers)

    TRAIN_DATA_PATTERN = data_dir + "train*"
    EVAL_DATA_PATTERN = data_dir + "test*"

    OUTPUT_DIR = os.environ['AIP_MODEL_DIR']
    logging.info('Writing trained model to %s', OUTPUT_DIR)

    train_batch_size = TRAIN_BATCH_SIZE
    eval_batch_size = 1000
    if steps_per_epoch == -1:  # calc based on dataset size
        steps_per_epoch = NUM_EXAMPLES // train_batch_size
    logging.info('using %s steps per epoch', steps_per_epoch)

    train_dataset = bwmodel.read_dataset(TRAIN_DATA_PATTERN, train_batch_size)
    eval_dataset = bwmodel.read_dataset(
        EVAL_DATA_PATTERN, eval_batch_size,
        tf.estimator.ModeKeys.EVAL,
        eval_batch_size * 100 * STRATEGY.num_replicas_in_sync
    )

    model = create_model(learning_rate, hidden_size, num_hidden_layers)

    # os.path.join yields a single separator whether or not OUTPUT_DIR ends
    # with '/', so the checkpoint and log paths are built the same way.
    checkpoint_path = os.path.join(OUTPUT_DIR, 'checkpoints/bikes_weather.cpt')
    logging.info("checkpoint path: %s", checkpoint_path)
    cp_callback = tf.keras.callbacks.ModelCheckpoint(checkpoint_path,
                                                     save_weights_only=True,
                                                     verbose=1)
    tb_callback = tf.keras.callbacks.TensorBoard(log_dir=os.path.join(OUTPUT_DIR, 'logs'),
                                                 update_freq=20000)

    logging.info("training model....")
    # NOTE(review): validation_steps reuses eval_batch_size (1000) as a step
    # count; left unchanged -- confirm this is intended.
    model.fit(train_dataset,
              validation_data=eval_dataset,
              validation_steps=eval_batch_size,
              epochs=epochs,
              steps_per_epoch=steps_per_epoch,
              callbacks=[cp_callback, tb_callback]
              )

    tf.saved_model.save(model, OUTPUT_DIR)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    # Input Arguments
    parser.add_argument(
        '--epochs', type=int, default=5)
    parser.add_argument(
        # e.g. {"num_hidden_layers": 3, "hidden_size": 96, "learning_rate": 0.01}
        '--hptune-dict', required=True)
    parser.add_argument(
        '--steps-per-epoch', type=int,
        default=-1)  # if set to -1, don't override the normal calcs for this
    parser.add_argument(
        '--data-dir', default='gs://aju-dev-demos-codelabs/bikes_weather/')
    args = parser.parse_args()

    run_training(args.epochs, args.data_dir, args.steps_per_epoch, args.hptune_dict)


# ---------------------------------------------------------------------------
# Patch-series text that shared the original line (kept verbatim as comments;
# the last hunk line continues in the text that follows this block):
# From f51e2a657aad067498bef530ec164d06e7e2c07a Mon Sep 17 00:00:00 2001
# From: Amy Unruh
# Date: Tue, 23 Mar 2021 10:31:57 -0700
# Subject: [PATCH 2/4] checkpointing
# ---
#  .../components/ucaip/bw_ucaip_train_pl.py | 14 +-
#  .../ucaip/model_train_component.yaml | 132 ++++---
#  .../ucaip/model_upload_component.yaml | 37 +-
#  .../components/ucaip/serving/model_upload.py | 17 +-
#  .../ucaip/training/create_training_job.py | 93 ++---
#  .../components/ucaip/training/trainer/task.py | 30 +-
#  .../ucaip/ucaip_train_pipeline_spec.json | 321 ++++++++++++++++++
#  7 files changed, 516 insertions(+), 128 deletions(-)
#  create mode 100644 ml/kubeflow-pipelines/keras_tuner/components/ucaip/ucaip_train_pipeline_spec.json
# diff --git a/ml/kubeflow-pipelines/keras_tuner/components/ucaip/bw_ucaip_train_pl.py b/ml/kubeflow-pipelines/keras_tuner/components/ucaip/bw_ucaip_train_pl.py
# index feaf33e..cfdf41c 100644
# --- a/ml/kubeflow-pipelines/keras_tuner/components/ucaip/bw_ucaip_train_pl.py
# +++ b/ml/kubeflow-pipelines/keras_tuner/components/ucaip/bw_ucaip_train_pl.py
# @@ -32,31 +32,33 @@ )
#  def model_train_pipeline(
#  prediction_image_uri: str = 'us-docker.pkg.dev/cloud-aiplatform/prediction/tf2-cpu.2-3:latest',
# - #
artifact_uri: str = 'gs://aju-pipelines/v64/077ae97e-9c6d-4c1c-b5a1-fc2e95fb7dbb/0/bwmodel/trained_model/export/bikesw/1615937808', location: str = "us-central1", api_endpoint: str = "us-central1-aiplatform.googleapis.com", project: str = 'aju-vtests2', training_display_name: str = 'CHANGE THIS', model_display_name: str = 'CHANGE THIS', endpoint_disp_name: str = 'CHANGE THIS', + train_container_type: str = 'prebuilt', executor_image_uri: str = 'us-docker.pkg.dev/cloud-aiplatform/training/tf-gpu.2-3:latest', package_uri: str = 'gs://aju-pipelines/ucaip/training1/bw-trainer-0.1.tar.gz', python_module: str = 'trainer.task', + container_image_uri: str = '', base_output_directory_prefix: str = 'gs://aju-pipelines/ucaip/training2/', timeout: int = 7200, - epochs: int = 5, - steps_per_epoch: int = -1, - hptune_dict: str = '{"num_hidden_layers": 3, "hidden_size": 32, "learning_rate": 0.01}', + hptune_dict: str = '{"num_hidden_layers": 3, "hidden_size": 32, "learning_rate": 0.01, "epochs": 3, "steps_per_epoch": -1}', data_dir: str = 'gs://aju-dev-demos-codelabs/bikes_weather/' ): model_train = model_train_op( project, training_display_name, model_display_name, + train_container_type, executor_image_uri, package_uri, python_module, + container_image_uri, base_output_directory_prefix, prediction_image_uri, - location, api_endpoint, epochs, - data_dir, steps_per_epoch, hptune_dict + location, api_endpoint, + data_dir, + hptune_dict ) model_deploy = model_deploy_op( diff --git a/ml/kubeflow-pipelines/keras_tuner/components/ucaip/model_train_component.yaml b/ml/kubeflow-pipelines/keras_tuner/components/ucaip/model_train_component.yaml index 72c8e32..3c46b74 100644 --- a/ml/kubeflow-pipelines/keras_tuner/components/ucaip/model_train_component.yaml +++ b/ml/kubeflow-pipelines/keras_tuner/components/ucaip/model_train_component.yaml @@ -3,16 +3,16 @@ inputs: - {name: project, type: String} - {name: display_name, type: String} - {name: model_display_name, type: String} +- {name: 
train_container_type, type: String} - {name: executor_image_uri, type: String} - {name: package_uri, type: String} - {name: python_module, type: String} +- {name: container_image_uri, type: String} - {name: base_output_directory_prefix, type: String} - {name: prediction_image_uri, type: String} - {name: location, type: String} - {name: api_endpoint, type: String} -- {name: epochs, type: Integer} - {name: data_dir, type: String} -- {name: steps_per_epoch, type: Integer} - {name: hptune_dict, type: String} outputs: - {name: model_id, type: String} @@ -28,28 +28,23 @@ implementation: printf "%s" "$0" > "$program_path" python3 -u "$program_path" "$@" - | - def _make_parent_dirs_and_return_path(file_path: str): - import os - os.makedirs(os.path.dirname(file_path), exist_ok=True) - return file_path - def create_training_pipeline_custom_job( project, display_name, model_display_name, + train_container_type, executor_image_uri, package_uri, python_module, + container_image_uri, base_output_directory_prefix, prediction_image_uri, # 'us-docker.pkg.dev/cloud-aiplatform/prediction/tf2-cpu.2-3:latest' location, # "us-central1" api_endpoint, # "us-central1-aiplatform.googleapis.com", - epochs, data_dir, - steps_per_epoch, hptune_dict, - model_id, - model_dispname + # model_id: OutputPath('String'), + # model_dispname: OutputPath('String') ): import logging @@ -69,36 +64,43 @@ implementation: # This client only needs to be created once, and can be reused for multiple requests. 
client = aiplatform.gapic.PipelineServiceClient(client_options=client_options) - training_task_inputs_dict = { - "workerPoolSpecs": [ - { - # "machine_spec": {"machineType": "n1-standard-16"}, + if train_container_type == 'prebuilt': + python_package_spec = { + "executor_image_uri": executor_image_uri, + "package_uris": [package_uri], + "python_module": python_module, + "args": [f"--data-dir={data_dir}", + f"--hptune-dict={hptune_dict}"]} + worker_pool_spec = { + "machine_spec": { + "machine_type": "n1-standard-16", + "accelerator_type": aiplatform.gapic.AcceleratorType.NVIDIA_TESLA_K80, + "accelerator_count": 2, + }, + "replica_count": 1, + "python_package_spec": python_package_spec, + } + elif train_container_type == 'custom': + container_spec = { + # A working docker image can be found at gs://cloud-samples-data/ai-platform/mnist_tfrecord/custom_job + "imageUri": container_image_uri, + "args": [ + # AIP_MODEL_DIR is set by the service according to baseOutputDirectory. + "--model_dir=$(AIP_MODEL_DIR)", + ]} + worker_pool_spec = { "machine_spec": { "machine_type": "n1-standard-16", "accelerator_type": aiplatform.gapic.AcceleratorType.NVIDIA_TESLA_K80, "accelerator_count": 2, }, "replica_count": 1, - "python_package_spec": { - "executor_image_uri": executor_image_uri, - "package_uris": [package_uri], - "python_module": python_module, - "args": [f"--epochs={epochs}", f"--data-dir={data_dir}", - f"--steps-per-epoch={steps_per_epoch}", f"--hptune-dict={hptune_dict}"], - }, + "container_spec": container_spec, } - # { - # "replicaCount": 1, - # "machineSpec": {"machineType": "n1-standard-4"}, - # "containerSpec": { - # # A working docker image can be found at gs://cloud-samples-data/ai-platform/mnist_tfrecord/custom_job - # "imageUri": container_image_uri, - # "args": [ - # # AIP_MODEL_DIR is set by the service according to baseOutputDirectory. 
- # "--model_dir=$(AIP_MODEL_DIR)", - # ], - # }, - # } + + training_task_inputs_dict = { + "workerPoolSpecs": [ + worker_pool_spec ], "baseOutputDirectory": { # The GCS location for outputs must be accessible by the project's AI Platform service account. @@ -135,42 +137,67 @@ implementation: mresponse = client.get_training_pipeline(name=training_pipeline_name) logging.info('mresponse: %s', mresponse) logging.info('job state: %s', mresponse.state) + if mresponse.state == pipeline_state.PipelineState.PIPELINE_STATE_FAILED: + logging.warning('training pipeline failed: %s', mresponse) + break if mresponse.state == pipeline_state.PipelineState.PIPELINE_STATE_SUCCEEDED: logging.info('training finished') - # write some outputs once finished model_name = mresponse.model_to_upload.name - logging.info('got model name: %s', model_name) - with open('temp.txt', "w") as outfile: - outfile.write(model_name) - subprocess.run(['gsutil', 'cp', 'temp.txt', model_id]) - with open('temp2.txt', "w") as outfile: - outfile.write(model_display_name) - subprocess.run(['gsutil', 'cp', 'temp2.txt', model_dispname]) - break + return (model_name, model_display_name) + # # write some outputs once finished + # model_name = mresponse.model_to_upload.name + # logging.info('got model name: %s', model_name) + # with open('temp.txt', "w") as outfile: + # outfile.write(model_name) + # subprocess.run(['gsutil', 'cp', 'temp.txt', model_id]) + # with open('temp2.txt', "w") as outfile: + # outfile.write(model_display_name) + # subprocess.run(['gsutil', 'cp', 'temp2.txt', model_dispname]) + # break else: time.sleep(SLEEP_INTERVAL) + def _serialize_str(str_value: str) -> str: + if not isinstance(str_value, str): + raise TypeError('Value "{}" has type "{}" instead of str.'.format(str(str_value), str(type(str_value)))) + return str_value + import argparse _parser = argparse.ArgumentParser(prog='Create training pipeline custom job', description='') _parser.add_argument("--project", dest="project", type=str, 
required=True, default=argparse.SUPPRESS) _parser.add_argument("--display-name", dest="display_name", type=str, required=True, default=argparse.SUPPRESS) _parser.add_argument("--model-display-name", dest="model_display_name", type=str, required=True, default=argparse.SUPPRESS) + _parser.add_argument("--train-container-type", dest="train_container_type", type=str, required=True, default=argparse.SUPPRESS) _parser.add_argument("--executor-image-uri", dest="executor_image_uri", type=str, required=True, default=argparse.SUPPRESS) _parser.add_argument("--package-uri", dest="package_uri", type=str, required=True, default=argparse.SUPPRESS) _parser.add_argument("--python-module", dest="python_module", type=str, required=True, default=argparse.SUPPRESS) + _parser.add_argument("--container-image-uri", dest="container_image_uri", type=str, required=True, default=argparse.SUPPRESS) _parser.add_argument("--base-output-directory-prefix", dest="base_output_directory_prefix", type=str, required=True, default=argparse.SUPPRESS) _parser.add_argument("--prediction-image-uri", dest="prediction_image_uri", type=str, required=True, default=argparse.SUPPRESS) _parser.add_argument("--location", dest="location", type=str, required=True, default=argparse.SUPPRESS) _parser.add_argument("--api-endpoint", dest="api_endpoint", type=str, required=True, default=argparse.SUPPRESS) - _parser.add_argument("--epochs", dest="epochs", type=int, required=True, default=argparse.SUPPRESS) _parser.add_argument("--data-dir", dest="data_dir", type=str, required=True, default=argparse.SUPPRESS) - _parser.add_argument("--steps-per-epoch", dest="steps_per_epoch", type=int, required=True, default=argparse.SUPPRESS) _parser.add_argument("--hptune-dict", dest="hptune_dict", type=str, required=True, default=argparse.SUPPRESS) - _parser.add_argument("--model-id", dest="model_id", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS) - _parser.add_argument("--model-dispname", 
dest="model_dispname", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS) + _parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=2) _parsed_args = vars(_parser.parse_args()) + _output_files = _parsed_args.pop("_output_paths", []) _outputs = create_training_pipeline_custom_job(**_parsed_args) + + _output_serializers = [ + _serialize_str, + _serialize_str, + + ] + + import os + for idx, output_file in enumerate(_output_files): + try: + os.makedirs(os.path.dirname(output_file)) + except OSError: + pass + with open(output_file, 'w') as f: + f.write(_output_serializers[idx](_outputs[idx])) args: - --project - {inputValue: project} @@ -178,12 +205,16 @@ implementation: - {inputValue: display_name} - --model-display-name - {inputValue: model_display_name} + - --train-container-type + - {inputValue: train_container_type} - --executor-image-uri - {inputValue: executor_image_uri} - --package-uri - {inputValue: package_uri} - --python-module - {inputValue: python_module} + - --container-image-uri + - {inputValue: container_image_uri} - --base-output-directory-prefix - {inputValue: base_output_directory_prefix} - --prediction-image-uri @@ -192,15 +223,10 @@ implementation: - {inputValue: location} - --api-endpoint - {inputValue: api_endpoint} - - --epochs - - {inputValue: epochs} - --data-dir - {inputValue: data_dir} - - --steps-per-epoch - - {inputValue: steps_per_epoch} - --hptune-dict - {inputValue: hptune_dict} - - --model-id + - '----output-paths' - {outputPath: model_id} - - --model-dispname - {outputPath: model_dispname} diff --git a/ml/kubeflow-pipelines/keras_tuner/components/ucaip/model_upload_component.yaml b/ml/kubeflow-pipelines/keras_tuner/components/ucaip/model_upload_component.yaml index e98f80b..1269f29 100644 --- a/ml/kubeflow-pipelines/keras_tuner/components/ucaip/model_upload_component.yaml +++ b/ml/kubeflow-pipelines/keras_tuner/components/ucaip/model_upload_component.yaml @@ -20,11 +20,6 @@ 
implementation: printf "%s" "$0" > "$program_path" python3 -u "$program_path" "$@" - | - def _make_parent_dirs_and_return_path(file_path: str): - import os - os.makedirs(os.path.dirname(file_path), exist_ok=True) - return file_path - def upload_model( project, display_name, @@ -33,8 +28,9 @@ implementation: location, # "us-central1", api_endpoint, #"us-central1-aiplatform.googleapis.com", timeout, # 1800, - model_id + # model_id: OutputPath('String') ): + import logging import subprocess from google.cloud import aiplatform @@ -71,10 +67,12 @@ implementation: upload_model_response = response.result(timeout=timeout) logging.info("upload_model_response: %s", upload_model_response) model_path = upload_model_response.model - logging.info('got model path: %s', model_path) - with open('temp.txt', "w") as outfile: - outfile.write(model_path) - subprocess.run(['gsutil', 'cp', 'temp.txt', model_id]) + return (model_path, ) + + def _serialize_str(str_value: str) -> str: + if not isinstance(str_value, str): + raise TypeError('Value "{}" has type "{}" instead of str.'.format(str(str_value), str(type(str_value)))) + return str_value import argparse _parser = argparse.ArgumentParser(prog='Upload model', description='') @@ -85,10 +83,25 @@ implementation: _parser.add_argument("--location", dest="location", type=str, required=True, default=argparse.SUPPRESS) _parser.add_argument("--api-endpoint", dest="api_endpoint", type=str, required=True, default=argparse.SUPPRESS) _parser.add_argument("--timeout", dest="timeout", type=int, required=True, default=argparse.SUPPRESS) - _parser.add_argument("--model-id", dest="model_id", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS) + _parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=1) _parsed_args = vars(_parser.parse_args()) + _output_files = _parsed_args.pop("_output_paths", []) _outputs = upload_model(**_parsed_args) + + _output_serializers = [ + _serialize_str, + + ] + + import 
os + for idx, output_file in enumerate(_output_files): + try: + os.makedirs(os.path.dirname(output_file)) + except OSError: + pass + with open(output_file, 'w') as f: + f.write(_output_serializers[idx](_outputs[idx])) args: - --project - {inputValue: project} @@ -104,5 +117,5 @@ implementation: - {inputValue: api_endpoint} - --timeout - {inputValue: timeout} - - --model-id + - '----output-paths' - {outputPath: model_id} diff --git a/ml/kubeflow-pipelines/keras_tuner/components/ucaip/serving/model_upload.py b/ml/kubeflow-pipelines/keras_tuner/components/ucaip/serving/model_upload.py index a533fef..f275713 100644 --- a/ml/kubeflow-pipelines/keras_tuner/components/ucaip/serving/model_upload.py +++ b/ml/kubeflow-pipelines/keras_tuner/components/ucaip/serving/model_upload.py @@ -12,7 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -from kfp.components import OutputPath +# from kfp.components import OutputPath +from typing import NamedTuple def upload_model( @@ -23,8 +24,9 @@ def upload_model( location: str, # "us-central1", api_endpoint: str, #"us-central1-aiplatform.googleapis.com", timeout: int, # 1800, - model_id: OutputPath('String') - ): + # model_id: OutputPath('String') + ) -> NamedTuple('Outputs', [('model_id', str)]): + import logging import subprocess from google.cloud import aiplatform @@ -61,10 +63,11 @@ def upload_model( upload_model_response = response.result(timeout=timeout) logging.info("upload_model_response: %s", upload_model_response) model_path = upload_model_response.model - logging.info('got model path: %s', model_path) - with open('temp.txt', "w") as outfile: - outfile.write(model_path) - subprocess.run(['gsutil', 'cp', 'temp.txt', model_id]) + return (model_path, ) + # logging.info('got model path: %s', model_path) + # with open('temp.txt', "w") as outfile: + # outfile.write(model_path) + # subprocess.run(['gsutil', 'cp', 'temp.txt', model_id]) diff --git 
a/ml/kubeflow-pipelines/keras_tuner/components/ucaip/training/create_training_job.py b/ml/kubeflow-pipelines/keras_tuner/components/ucaip/training/create_training_job.py index b5c64ff..46756cd 100644 --- a/ml/kubeflow-pipelines/keras_tuner/components/ucaip/training/create_training_job.py +++ b/ml/kubeflow-pipelines/keras_tuner/components/ucaip/training/create_training_job.py @@ -13,26 +13,28 @@ # limitations under the License. -from kfp.components import OutputPath +# from kfp.components import OutputPath +from typing import NamedTuple + def create_training_pipeline_custom_job( project: str, display_name: str, model_display_name: str, + train_container_type: str, executor_image_uri: str, package_uri: str, python_module: str, + container_image_uri: str, base_output_directory_prefix: str, prediction_image_uri: str, # 'us-docker.pkg.dev/cloud-aiplatform/prediction/tf2-cpu.2-3:latest' location: str, # "us-central1" api_endpoint: str, # "us-central1-aiplatform.googleapis.com", - epochs: int, data_dir: str, - steps_per_epoch: int, hptune_dict: str, - model_id: OutputPath('String'), - model_dispname: OutputPath('String') -): + # model_id: OutputPath('String'), + # model_dispname: OutputPath('String') +) -> NamedTuple('Outputs', [('model_id', str), ('model_dispname', str)]): import logging import subprocess @@ -51,36 +53,46 @@ def create_training_pipeline_custom_job( # This client only needs to be created once, and can be reused for multiple requests. 
client = aiplatform.gapic.PipelineServiceClient(client_options=client_options) - training_task_inputs_dict = { - "workerPoolSpecs": [ - { - # "machine_spec": {"machineType": "n1-standard-16"}, + if train_container_type == 'prebuilt': + python_package_spec = { + "executor_image_uri": executor_image_uri, + "package_uris": [package_uri], + "python_module": python_module, + "args": [f"--data-dir={data_dir}", + f"--hptune-dict={hptune_dict}"]} + worker_pool_spec = { "machine_spec": { "machine_type": "n1-standard-16", "accelerator_type": aiplatform.gapic.AcceleratorType.NVIDIA_TESLA_K80, "accelerator_count": 2, }, "replica_count": 1, - "python_package_spec": { - "executor_image_uri": executor_image_uri, - "package_uris": [package_uri], - "python_module": python_module, - "args": [f"--epochs={epochs}", f"--data-dir={data_dir}", - f"--steps-per-epoch={steps_per_epoch}", f"--hptune-dict={hptune_dict}"], - }, + "python_package_spec": python_package_spec, } - # { - # "replicaCount": 1, - # "machineSpec": {"machineType": "n1-standard-4"}, - # "containerSpec": { - # # A working docker image can be found at gs://cloud-samples-data/ai-platform/mnist_tfrecord/custom_job - # "imageUri": container_image_uri, - # "args": [ - # # AIP_MODEL_DIR is set by the service according to baseOutputDirectory. - # "--model_dir=$(AIP_MODEL_DIR)", - # ], - # }, - # } + elif train_container_type == 'custom': + container_spec = { + # A working docker image can be found at gs://cloud-samples-data/ai-platform/mnist_tfrecord/custom_job + "imageUri": container_image_uri, + "args": [ + # AIP_MODEL_DIR is set by the service according to baseOutputDirectory. 
+ "--model_dir=$(AIP_MODEL_DIR)", + ]} + worker_pool_spec = { + "machine_spec": { + "machine_type": "n1-standard-16", + "accelerator_type": aiplatform.gapic.AcceleratorType.NVIDIA_TESLA_K80, + "accelerator_count": 2, + }, + "replica_count": 1, + "container_spec": container_spec, + } + else: + logging.warning('unknown train_container_type; exiting') + exit(1) + + training_task_inputs_dict = { + "workerPoolSpecs": [ + worker_pool_spec ], "baseOutputDirectory": { # The GCS location for outputs must be accessible by the project's AI Platform service account. @@ -117,18 +129,23 @@ def create_training_pipeline_custom_job( mresponse = client.get_training_pipeline(name=training_pipeline_name) logging.info('mresponse: %s', mresponse) logging.info('job state: %s', mresponse.state) + if mresponse.state == pipeline_state.PipelineState.PIPELINE_STATE_FAILED: + logging.warning('training pipeline failed: %s', mresponse) + exit(1) if mresponse.state == pipeline_state.PipelineState.PIPELINE_STATE_SUCCEEDED: logging.info('training finished') - # write some outputs once finished model_name = mresponse.model_to_upload.name - logging.info('got model name: %s', model_name) - with open('temp.txt', "w") as outfile: - outfile.write(model_name) - subprocess.run(['gsutil', 'cp', 'temp.txt', model_id]) - with open('temp2.txt', "w") as outfile: - outfile.write(model_display_name) - subprocess.run(['gsutil', 'cp', 'temp2.txt', model_dispname]) - break + return (model_name, model_display_name) + # # write some outputs once finished + # model_name = mresponse.model_to_upload.name + # logging.info('got model name: %s', model_name) + # with open('temp.txt', "w") as outfile: + # outfile.write(model_name) + # subprocess.run(['gsutil', 'cp', 'temp.txt', model_id]) + # with open('temp2.txt', "w") as outfile: + # outfile.write(model_display_name) + # subprocess.run(['gsutil', 'cp', 'temp2.txt', model_dispname]) + # break else: time.sleep(SLEEP_INTERVAL) diff --git 
a/ml/kubeflow-pipelines/keras_tuner/components/ucaip/training/trainer/task.py b/ml/kubeflow-pipelines/keras_tuner/components/ucaip/training/trainer/task.py index 88de6a5..85e4218 100644 --- a/ml/kubeflow-pipelines/keras_tuner/components/ucaip/training/trainer/task.py +++ b/ml/kubeflow-pipelines/keras_tuner/components/ucaip/training/trainer/task.py @@ -56,8 +56,10 @@ def create_model(learning_rate, hidden_size, num_hidden_layers): return model def run_training( - epochs: int, data_dir: str, - steps_per_epoch: int, hptune_dict: str + # epochs: int, + data_dir: str, + # steps_per_epoch: int, + hptune_dict: str ): if 'AIP_MODEL_DIR' not in os.environ: @@ -69,18 +71,18 @@ def run_training( logging.getLogger().setLevel(logging.INFO) # data_dir = 'gs://aju-dev-demos-codelabs/bikes_weather/' - # epochs = 4 - # steps_per_epoch = -1 - logging.info('epochs: %s', epochs) - logging.info('Tensorflow version %s', tf.__version__) hptune_info = json.loads(str(args.hptune_dict)) logging.info('hptune_info: %s', hptune_info) learning_rate = hptune_info['learning_rate'] hidden_size = hptune_info['hidden_size'] num_hidden_layers = hptune_info['num_hidden_layers'] + epochs = hptune_info['epochs'] + steps_per_epoch = hptune_info['steps_per_epoch'] logging.info('using: learning rate %s, hidden size %s, first hidden layer %s', learning_rate, hidden_size, num_hidden_layers) + logging.info('epochs: %s', epochs) + logging.info('Tensorflow version %s', tf.__version__) TRAIN_DATA_PATTERN = data_dir + "train*" EVAL_DATA_PATTERN = data_dir + "test*" @@ -126,16 +128,20 @@ def run_training( if __name__ == '__main__': parser = argparse.ArgumentParser() # Input Arguments - parser.add_argument( - '--epochs', type=int, default=5) + # parser.add_argument( + # '--epochs', type=int, default=5) parser.add_argument( # e.g. 
{"num_hidden_layers": 3, "hidden_size": 96, "learning_rate": 0.01} '--hptune-dict', required=True) - parser.add_argument( - '--steps-per-epoch', type=int, - default=-1) # if set to -1, don't override the normal calcs for this + # parser.add_argument( + # '--steps-per-epoch', type=int, + # default=-1) # if set to -1, don't override the normal calcs for this parser.add_argument( '--data-dir', default='gs://aju-dev-demos-codelabs/bikes_weather/') args = parser.parse_args() - run_training(args.epochs, args.data_dir, args.steps_per_epoch, args.hptune_dict) + run_training( + # args.epochs, + args.data_dir, + # args.steps_per_epoch, + args.hptune_dict) diff --git a/ml/kubeflow-pipelines/keras_tuner/components/ucaip/ucaip_train_pipeline_spec.json b/ml/kubeflow-pipelines/keras_tuner/components/ucaip/ucaip_train_pipeline_spec.json new file mode 100644 index 0000000..8b57ece --- /dev/null +++ b/ml/kubeflow-pipelines/keras_tuner/components/ucaip/ucaip_train_pipeline_spec.json @@ -0,0 +1,321 @@ +{ + "pipelineSpec": { + "runtimeParameters": { + "train_container_type": { + "defaultValue": { + "stringValue": "prebuilt" + }, + "type": "STRING" + }, + "hptune_dict": { + "type": "STRING", + "defaultValue": { + "stringValue": "{\"num_hidden_layers\": 3, \"hidden_size\": 32, \"learning_rate\": 0.01, \"epochs\": 3, \"steps_per_epoch\": -1}" + } + }, + "timeout": { + "type": "INT", + "defaultValue": { + "intValue": "7200" + } + }, + "project": { + "type": "STRING", + "defaultValue": { + "stringValue": "aju-vtests2" + } + }, + "python_module": { + "type": "STRING", + "defaultValue": { + "stringValue": "trainer.task" + } + }, + "package_uri": { + "defaultValue": { + "stringValue": "gs://aju-pipelines/ucaip/training1/bw-trainer-0.1.tar.gz" + }, + "type": "STRING" + }, + "container_image_uri": { + "type": "STRING" + }, + "prediction_image_uri": { + "defaultValue": { + "stringValue": "us-docker.pkg.dev/cloud-aiplatform/prediction/tf2-cpu.2-3:latest" + }, + "type": "STRING" + }, + 
"executor_image_uri": { + "type": "STRING", + "defaultValue": { + "stringValue": "us-docker.pkg.dev/cloud-aiplatform/training/tf-gpu.2-3:latest" + } + }, + "endpoint_disp_name": { + "defaultValue": { + "stringValue": "CHANGE THIS" + }, + "type": "STRING" + }, + "base_output_directory_prefix": { + "defaultValue": { + "stringValue": "gs://aju-pipelines/ucaip/training2/" + }, + "type": "STRING" + }, + "model_display_name": { + "type": "STRING", + "defaultValue": { + "stringValue": "CHANGE THIS" + } + }, + "api_endpoint": { + "type": "STRING", + "defaultValue": { + "stringValue": "us-central1-aiplatform.googleapis.com" + } + }, + "data_dir": { + "type": "STRING", + "defaultValue": { + "stringValue": "gs://aju-dev-demos-codelabs/bikes_weather/" + } + }, + "location": { + "type": "STRING", + "defaultValue": { + "stringValue": "us-central1" + } + }, + "training_display_name": { + "defaultValue": { + "stringValue": "CHANGE THIS" + }, + "type": "STRING" + } + }, + "sdkVersion": "kfp-1.4.0", + "schemaVersion": "v2alpha1", + "tasks": [ + { + "outputs": { + "parameters": { + "model_id": { + "type": "STRING" + }, + "model_dispname": { + "type": "STRING" + } + } + }, + "taskInfo": { + "name": "Create training pipeline custom job" + }, + "inputs": { + "parameters": { + "project": { + "runtimeValue": { + "runtimeParameter": "project" + } + }, + "base_output_directory_prefix": { + "runtimeValue": { + "runtimeParameter": "base_output_directory_prefix" + } + }, + "python_module": { + "runtimeValue": { + "runtimeParameter": "python_module" + } + }, + "container_image_uri": { + "runtimeValue": { + "runtimeParameter": "container_image_uri" + } + }, + "display_name": { + "runtimeValue": { + "runtimeParameter": "training_display_name" + } + }, + "location": { + "runtimeValue": { + "runtimeParameter": "location" + } + }, + "api_endpoint": { + "runtimeValue": { + "runtimeParameter": "api_endpoint" + } + }, + "hptune_dict": { + "runtimeValue": { + "runtimeParameter": "hptune_dict" + } + }, + 
"train_container_type": { + "runtimeValue": { + "runtimeParameter": "train_container_type" + } + }, + "executor_image_uri": { + "runtimeValue": { + "runtimeParameter": "executor_image_uri" + } + }, + "model_display_name": { + "runtimeValue": { + "runtimeParameter": "model_display_name" + } + }, + "prediction_image_uri": { + "runtimeValue": { + "runtimeParameter": "prediction_image_uri" + } + }, + "data_dir": { + "runtimeValue": { + "runtimeParameter": "data_dir" + } + }, + "package_uri": { + "runtimeValue": { + "runtimeParameter": "package_uri" + } + } + } + }, + "executorLabel": "Create training pipeline custom job" + }, + { + "executorLabel": "Deploy model", + "inputs": { + "parameters": { + "endpoint_disp_name": { + "runtimeValue": { + "runtimeParameter": "endpoint_disp_name" + } + }, + "deployed_model_display_name": { + "runtimeValue": { + "runtimeParameter": "model_display_name" + } + }, + "timeout": { + "runtimeValue": { + "runtimeParameter": "timeout" + } + }, + "api_endpoint": { + "runtimeValue": { + "runtimeParameter": "api_endpoint" + } + }, + "project": { + "runtimeValue": { + "runtimeParameter": "project" + } + }, + "location": { + "runtimeValue": { + "runtimeParameter": "location" + } + }, + "model_name": { + "taskOutputParameter": { + "outputParameterKey": "model_id", + "producerTask": "Create training pipeline custom job" + } + } + } + }, + "taskInfo": { + "name": "Deploy model" + } + } + ], + "deploymentConfig": { + "executors": { + "Create training pipeline custom job": { + "container": { + "command": [ + "sh", + "-ec", + "program_path=$(mktemp)\nprintf \"%s\" \"$0\" > \"$program_path\"\npython3 -u \"$program_path\" \"$@\"\n", + "def create_training_pipeline_custom_job(\n project,\n display_name,\n model_display_name,\n train_container_type,\n executor_image_uri,\n package_uri,\n python_module,\n container_image_uri,\n base_output_directory_prefix,\n prediction_image_uri, # 'us-docker.pkg.dev/cloud-aiplatform/prediction/tf2-cpu.2-3:latest'\n 
location, # \"us-central1\"\n api_endpoint, # \"us-central1-aiplatform.googleapis.com\",\n data_dir,\n hptune_dict,\n # model_id: OutputPath('String'),\n # model_dispname: OutputPath('String')\n):\n\n import logging\n import subprocess\n import time\n\n from google.cloud import aiplatform\n from google.protobuf import json_format\n from google.protobuf.struct_pb2 import Value\n from google.cloud.aiplatform_v1beta1.types import pipeline_state\n\n logging.getLogger().setLevel(logging.INFO)\n\n # The AI Platform services require regional API endpoints.\n client_options = {\"api_endpoint\": api_endpoint}\n # Initialize client that will be used to create and send requests.\n # This client only needs to be created once, and can be reused for multiple requests.\n client = aiplatform.gapic.PipelineServiceClient(client_options=client_options)\n\n if train_container_type == 'prebuilt':\n python_package_spec = {\n \"executor_image_uri\": executor_image_uri,\n \"package_uris\": [package_uri],\n \"python_module\": python_module,\n \"args\": [f\"--data-dir={data_dir}\",\n f\"--hptune-dict={hptune_dict}\"]}\n worker_pool_spec = {\n \"machine_spec\": {\n \"machine_type\": \"n1-standard-16\",\n \"accelerator_type\": aiplatform.gapic.AcceleratorType.NVIDIA_TESLA_K80,\n \"accelerator_count\": 2,\n },\n \"replica_count\": 1,\n \"python_package_spec\": python_package_spec,\n }\n elif train_container_type == 'custom':\n container_spec = {\n # A working docker image can be found at gs://cloud-samples-data/ai-platform/mnist_tfrecord/custom_job\n \"imageUri\": container_image_uri,\n \"args\": [\n # AIP_MODEL_DIR is set by the service according to baseOutputDirectory.\n \"--model_dir=$(AIP_MODEL_DIR)\",\n ]}\n worker_pool_spec = {\n \"machine_spec\": {\n \"machine_type\": \"n1-standard-16\",\n \"accelerator_type\": aiplatform.gapic.AcceleratorType.NVIDIA_TESLA_K80,\n \"accelerator_count\": 2,\n },\n \"replica_count\": 1,\n \"container_spec\": container_spec,\n }\n\n 
training_task_inputs_dict = {\n \"workerPoolSpecs\": [\n worker_pool_spec\n ],\n \"baseOutputDirectory\": {\n # The GCS location for outputs must be accessible by the project's AI Platform service account.\n \"output_uri_prefix\": base_output_directory_prefix\n },\n }\n training_task_inputs = json_format.ParseDict(training_task_inputs_dict, Value())\n\n training_task_definition = \"gs://google-cloud-aiplatform/schema/trainingjob/definition/custom_task_1.0.0.yaml\"\n # image_uri = \"gcr.io/cloud-aiplatform/prediction/tf-cpu.1-15:latest\"\n # image_uri = 'us-docker.pkg.dev/cloud-aiplatform/prediction/tf2-cpu.2-3:latest'\n\n training_pipeline = {\n \"display_name\": display_name,\n \"training_task_definition\": training_task_definition,\n \"training_task_inputs\": training_task_inputs,\n \"model_to_upload\": {\n \"display_name\": model_display_name,\n \"container_spec\": {\"image_uri\": prediction_image_uri},\n },\n }\n parent = f\"projects/{project}/locations/{location}\"\n response = client.create_training_pipeline(\n parent=parent, training_pipeline=training_pipeline\n )\n logging.info(\"training pipeline request response: %s\", response)\n\n SLEEP_INTERVAL = 100\n\n training_pipeline_name = response.name\n logging.info(\"training pipeline name: %s\", training_pipeline_name)\n # Poll periodically until training completes\n while True:\n mresponse = client.get_training_pipeline(name=training_pipeline_name)\n logging.info('mresponse: %s', mresponse)\n logging.info('job state: %s', mresponse.state)\n if mresponse.state == pipeline_state.PipelineState.PIPELINE_STATE_FAILED:\n logging.warning('training pipeline failed: %s', mresponse)\n break\n if mresponse.state == pipeline_state.PipelineState.PIPELINE_STATE_SUCCEEDED:\n logging.info('training finished')\n model_name = mresponse.model_to_upload.name\n return (model_name, model_display_name)\n # # write some outputs once finished\n # model_name = mresponse.model_to_upload.name\n # logging.info('got model name: %s', 
model_name)\n # with open('temp.txt', \"w\") as outfile:\n # outfile.write(model_name)\n # subprocess.run(['gsutil', 'cp', 'temp.txt', model_id])\n # with open('temp2.txt', \"w\") as outfile:\n # outfile.write(model_display_name)\n # subprocess.run(['gsutil', 'cp', 'temp2.txt', model_dispname])\n # break\n else:\n time.sleep(SLEEP_INTERVAL)\n\ndef _serialize_str(str_value: str) -> str:\n if not isinstance(str_value, str):\n raise TypeError('Value \"{}\" has type \"{}\" instead of str.'.format(str(str_value), str(type(str_value))))\n return str_value\n\nimport argparse\n_parser = argparse.ArgumentParser(prog='Create training pipeline custom job', description='')\n_parser.add_argument(\"--project\", dest=\"project\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--display-name\", dest=\"display_name\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--model-display-name\", dest=\"model_display_name\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--train-container-type\", dest=\"train_container_type\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--executor-image-uri\", dest=\"executor_image_uri\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--package-uri\", dest=\"package_uri\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--python-module\", dest=\"python_module\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--container-image-uri\", dest=\"container_image_uri\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--base-output-directory-prefix\", dest=\"base_output_directory_prefix\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--prediction-image-uri\", dest=\"prediction_image_uri\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--location\", dest=\"location\", type=str, 
required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--api-endpoint\", dest=\"api_endpoint\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--data-dir\", dest=\"data_dir\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--hptune-dict\", dest=\"hptune_dict\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"----output-paths\", dest=\"_output_paths\", type=str, nargs=2)\n_parsed_args = vars(_parser.parse_args())\n_output_files = _parsed_args.pop(\"_output_paths\", [])\n\n_outputs = create_training_pipeline_custom_job(**_parsed_args)\n\n_output_serializers = [\n _serialize_str,\n _serialize_str,\n\n]\n\nimport os\nfor idx, output_file in enumerate(_output_files):\n try:\n os.makedirs(os.path.dirname(output_file))\n except OSError:\n pass\n with open(output_file, 'w') as f:\n f.write(_output_serializers[idx](_outputs[idx]))\n" + ], + "image": "gcr.io/aju-vtests2/bw-aiplatform:v1", + "args": [ + "--project", + "{{$.inputs.parameters['project']}}", + "--display-name", + "{{$.inputs.parameters['display_name']}}", + "--model-display-name", + "{{$.inputs.parameters['model_display_name']}}", + "--train-container-type", + "{{$.inputs.parameters['train_container_type']}}", + "--executor-image-uri", + "{{$.inputs.parameters['executor_image_uri']}}", + "--package-uri", + "{{$.inputs.parameters['package_uri']}}", + "--python-module", + "{{$.inputs.parameters['python_module']}}", + "--container-image-uri", + "{{$.inputs.parameters['container_image_uri']}}", + "--base-output-directory-prefix", + "{{$.inputs.parameters['base_output_directory_prefix']}}", + "--prediction-image-uri", + "{{$.inputs.parameters['prediction_image_uri']}}", + "--location", + "{{$.inputs.parameters['location']}}", + "--api-endpoint", + "{{$.inputs.parameters['api_endpoint']}}", + "--data-dir", + "{{$.inputs.parameters['data_dir']}}", + "--hptune-dict", + "{{$.inputs.parameters['hptune_dict']}}", + 
"----output-paths", + "{{$.outputs.parameters['model_id'].output_file}}", + "{{$.outputs.parameters['model_dispname'].output_file}}" + ] + } + }, + "Deploy model": { + "container": { + "args": [ + "--project", + "{{$.inputs.parameters['project']}}", + "--endpoint-disp-name", + "{{$.inputs.parameters['endpoint_disp_name']}}", + "--model-name", + "{{$.inputs.parameters['model_name']}}", + "--deployed-model-display-name", + "{{$.inputs.parameters['deployed_model_display_name']}}", + "--location", + "{{$.inputs.parameters['location']}}", + "--api-endpoint", + "{{$.inputs.parameters['api_endpoint']}}", + "--timeout", + "{{$.inputs.parameters['timeout']}}" + ], + "command": [ + "sh", + "-ec", + "program_path=$(mktemp)\nprintf \"%s\" \"$0\" > \"$program_path\"\npython3 -u \"$program_path\" \"$@\"\n", + "def deploy_model(\n project,\n endpoint_disp_name,\n model_name,\n deployed_model_display_name,\n location = \"us-central1\",\n api_endpoint = \"us-central1-aiplatform.googleapis.com\",\n timeout = 7200,\n ):\n\n import logging\n from google.cloud import aiplatform\n\n logging.getLogger().setLevel(logging.INFO)\n\n def create_endpoint(\n project,\n display_name,\n client,\n location = \"us-central1\",\n api_endpoint = \"us-central1-aiplatform.googleapis.com\",\n timeout = 300,\n ):\n\n endpoint = {\"display_name\": display_name}\n parent = f\"projects/{project}/locations/{location}\"\n response = client.create_endpoint(parent=parent, endpoint=endpoint)\n print(\"Long running operation:\", response.operation.name)\n create_endpoint_response = response.result(timeout=timeout)\n print(\"create_endpoint_response:\", create_endpoint_response)\n endpoint_name = create_endpoint_response.name\n logging.info('endpoint name: %s', endpoint_name)\n return endpoint_name\n\n # The AI Platform services require regional API endpoints.\n client_options = {\"api_endpoint\": api_endpoint}\n # Initialize client that will be used to create and send requests.\n # This client only needs to be 
created once, and can be reused for multiple requests.\n client = aiplatform.gapic.EndpointServiceClient(client_options=client_options)\n\n # create endpoint\n logging.info('creating endpoint %s', endpoint_disp_name)\n endpoint_path = create_endpoint(project, endpoint_disp_name, client)\n logging.info(\"using endpoint path ID %s\", endpoint_path)\n\n deployed_model = {\n # format: 'projects/{project}/locations/{location}/models/{model}'\n \"model\": model_name,\n \"display_name\": deployed_model_display_name,\n # `dedicated_resources` must be used for non-AutoML models\n \"dedicated_resources\": {\n \"min_replica_count\": 1,\n \"machine_spec\": {\n \"machine_type\": \"n1-standard-2\",\n # Accelerators can be used only if the model specifies a GPU image.\n # 'accelerator_type': aiplatform.gapic.AcceleratorType.NVIDIA_TESLA_K80,\n # 'accelerator_count': 1,\n },\n },\n }\n # key '0' assigns traffic for the newly deployed model\n # Traffic percentage values must add up to 100\n # Leave dictionary empty if endpoint should not accept any traffic\n traffic_split = {\"0\": 100}\n# endpoint = client.endpoint_path(\n# project=project, location=location, endpoint=endpoint_id\n# )\n response = client.deploy_model(\n # endpoint=endpoint, deployed_model=deployed_model, traffic_split=traffic_split\n endpoint=endpoint_path, deployed_model=deployed_model, traffic_split=traffic_split\n )\n logging.info(\"Long running operation: %s\", response.operation.name)\n deploy_model_response = response.result(timeout=timeout)\n logging.info(\"deploy_model_response: %s\", deploy_model_response)\n\nimport argparse\n_parser = argparse.ArgumentParser(prog='Deploy model', description='')\n_parser.add_argument(\"--project\", dest=\"project\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--endpoint-disp-name\", dest=\"endpoint_disp_name\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--model-name\", dest=\"model_name\", type=str, 
required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--deployed-model-display-name\", dest=\"deployed_model_display_name\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--location\", dest=\"location\", type=str, required=False, default=argparse.SUPPRESS)\n_parser.add_argument(\"--api-endpoint\", dest=\"api_endpoint\", type=str, required=False, default=argparse.SUPPRESS)\n_parser.add_argument(\"--timeout\", dest=\"timeout\", type=int, required=False, default=argparse.SUPPRESS)\n_parsed_args = vars(_parser.parse_args())\n\n_outputs = deploy_model(**_parsed_args)\n" + ], + "image": "gcr.io/aju-vtests2/bw-aiplatform:v1" + } + } + }, + "@type": "type.googleapis.com/ml_pipelines.PipelineDeploymentConfig" + }, + "pipelineInfo": { + "name": "ucaip-model-train" + } + }, + "runtimeConfig": { + "gcsOutputDirectory": "gs://aju-pipelines/pipeline_root/ucaiptests" + } +} \ No newline at end of file From 5635da43c4d302566848f9646c8fefc88f4d7bb1 Mon Sep 17 00:00:00 2001 From: Amy Unruh Date: Wed, 24 Mar 2021 11:40:23 -0700 Subject: [PATCH 3/4] cleanup, add example training package dist, changed container URI --- .../ucaip/model_deploy_component.yaml | 3 +- .../ucaip/model_train_component.yaml | 21 +- .../ucaip/model_upload_component.yaml | 3 +- .../components/ucaip/serving/deploy_model.py | 6 +- .../components/ucaip/serving/model_upload.py | 10 +- .../ucaip/training/create_training_job.py | 26 +- .../ucaip/training/dist/bw-trainer-0.1.tar.gz | Bin 0 -> 4427 bytes .../ucaip/ucaip_train_pipeline_spec.json | 326 +++++++++--------- 8 files changed, 174 insertions(+), 221 deletions(-) create mode 100644 ml/kubeflow-pipelines/keras_tuner/components/ucaip/training/dist/bw-trainer-0.1.tar.gz diff --git a/ml/kubeflow-pipelines/keras_tuner/components/ucaip/model_deploy_component.yaml b/ml/kubeflow-pipelines/keras_tuner/components/ucaip/model_deploy_component.yaml index f7bb9b4..8761f8c 100644 --- 
a/ml/kubeflow-pipelines/keras_tuner/components/ucaip/model_deploy_component.yaml +++ b/ml/kubeflow-pipelines/keras_tuner/components/ucaip/model_deploy_component.yaml @@ -10,7 +10,7 @@ inputs: - {name: timeout, type: Integer, default: '7200', optional: true} implementation: container: - image: gcr.io/aju-vtests2/bw-aiplatform:v1 + image: gcr.io/google-samples/bw-aiplatform:v1 command: - sh - -ec @@ -87,7 +87,6 @@ implementation: # project=project, location=location, endpoint=endpoint_id # ) response = client.deploy_model( - # endpoint=endpoint, deployed_model=deployed_model, traffic_split=traffic_split endpoint=endpoint_path, deployed_model=deployed_model, traffic_split=traffic_split ) logging.info("Long running operation: %s", response.operation.name) diff --git a/ml/kubeflow-pipelines/keras_tuner/components/ucaip/model_train_component.yaml b/ml/kubeflow-pipelines/keras_tuner/components/ucaip/model_train_component.yaml index 3c46b74..eece276 100644 --- a/ml/kubeflow-pipelines/keras_tuner/components/ucaip/model_train_component.yaml +++ b/ml/kubeflow-pipelines/keras_tuner/components/ucaip/model_train_component.yaml @@ -19,7 +19,7 @@ outputs: - {name: model_dispname, type: String} implementation: container: - image: gcr.io/aju-vtests2/bw-aiplatform:v1 + image: gcr.io/google-samples/bw-aiplatform:v1 command: - sh - -ec @@ -43,8 +43,6 @@ implementation: api_endpoint, # "us-central1-aiplatform.googleapis.com", data_dir, hptune_dict, - # model_id: OutputPath('String'), - # model_dispname: OutputPath('String') ): import logging @@ -97,6 +95,9 @@ implementation: "replica_count": 1, "container_spec": container_spec, } + else: + logging.warning('unknown train_container_type; exiting') + exit(1) training_task_inputs_dict = { "workerPoolSpecs": [ @@ -110,8 +111,6 @@ implementation: training_task_inputs = json_format.ParseDict(training_task_inputs_dict, Value()) training_task_definition = "gs://google-cloud-aiplatform/schema/trainingjob/definition/custom_task_1.0.0.yaml" - # 
image_uri = "gcr.io/cloud-aiplatform/prediction/tf-cpu.1-15:latest" - # image_uri = 'us-docker.pkg.dev/cloud-aiplatform/prediction/tf2-cpu.2-3:latest' training_pipeline = { "display_name": display_name, @@ -139,21 +138,11 @@ implementation: logging.info('job state: %s', mresponse.state) if mresponse.state == pipeline_state.PipelineState.PIPELINE_STATE_FAILED: logging.warning('training pipeline failed: %s', mresponse) - break + exit(1) if mresponse.state == pipeline_state.PipelineState.PIPELINE_STATE_SUCCEEDED: logging.info('training finished') model_name = mresponse.model_to_upload.name return (model_name, model_display_name) - # # write some outputs once finished - # model_name = mresponse.model_to_upload.name - # logging.info('got model name: %s', model_name) - # with open('temp.txt', "w") as outfile: - # outfile.write(model_name) - # subprocess.run(['gsutil', 'cp', 'temp.txt', model_id]) - # with open('temp2.txt', "w") as outfile: - # outfile.write(model_display_name) - # subprocess.run(['gsutil', 'cp', 'temp2.txt', model_dispname]) - # break else: time.sleep(SLEEP_INTERVAL) diff --git a/ml/kubeflow-pipelines/keras_tuner/components/ucaip/model_upload_component.yaml b/ml/kubeflow-pipelines/keras_tuner/components/ucaip/model_upload_component.yaml index 1269f29..2a054f0 100644 --- a/ml/kubeflow-pipelines/keras_tuner/components/ucaip/model_upload_component.yaml +++ b/ml/kubeflow-pipelines/keras_tuner/components/ucaip/model_upload_component.yaml @@ -11,7 +11,7 @@ outputs: - {name: model_id, type: String} implementation: container: - image: gcr.io/aju-vtests2/bw-aiplatform:v1 + image: gcr.io/google-samples/bw-aiplatform:v1 command: - sh - -ec @@ -28,7 +28,6 @@ implementation: location, # "us-central1", api_endpoint, #"us-central1-aiplatform.googleapis.com", timeout, # 1800, - # model_id: OutputPath('String') ): import logging diff --git a/ml/kubeflow-pipelines/keras_tuner/components/ucaip/serving/deploy_model.py 
b/ml/kubeflow-pipelines/keras_tuner/components/ucaip/serving/deploy_model.py index 405531a..27f5fea 100644 --- a/ml/kubeflow-pipelines/keras_tuner/components/ucaip/serving/deploy_model.py +++ b/ml/kubeflow-pipelines/keras_tuner/components/ucaip/serving/deploy_model.py @@ -80,18 +80,16 @@ def create_endpoint( # project=project, location=location, endpoint=endpoint_id # ) response = client.deploy_model( - # endpoint=endpoint, deployed_model=deployed_model, traffic_split=traffic_split endpoint=endpoint_path, deployed_model=deployed_model, traffic_split=traffic_split ) logging.info("Long running operation: %s", response.operation.name) deploy_model_response = response.result(timeout=timeout) logging.info("deploy_model_response: %s", deploy_model_response) + # TODO: output status info in some form if __name__ == '__main__': -# deploy_model('aju-vtests2', 'endpoint_test2', -# 'projects/467744782358/locations/us-central1/models/6181278449496227840', 'sdk_test1') import kfp kfp.components.func_to_container_op(deploy_model, output_component_file='../model_deploy_component.yaml', - base_image='gcr.io/aju-vtests2/bw-aiplatform:v1') \ No newline at end of file + base_image='gcr.io/google-samples/bw-aiplatform:v1') diff --git a/ml/kubeflow-pipelines/keras_tuner/components/ucaip/serving/model_upload.py b/ml/kubeflow-pipelines/keras_tuner/components/ucaip/serving/model_upload.py index f275713..c90586e 100644 --- a/ml/kubeflow-pipelines/keras_tuner/components/ucaip/serving/model_upload.py +++ b/ml/kubeflow-pipelines/keras_tuner/components/ucaip/serving/model_upload.py @@ -24,7 +24,6 @@ def upload_model( location: str, # "us-central1", api_endpoint: str, #"us-central1-aiplatform.googleapis.com", timeout: int, # 1800, - # model_id: OutputPath('String') ) -> NamedTuple('Outputs', [('model_id', str)]): import logging @@ -64,21 +63,14 @@ def upload_model( logging.info("upload_model_response: %s", upload_model_response) model_path = upload_model_response.model return (model_path, ) - # 
logging.info('got model path: %s', model_path) - # with open('temp.txt', "w") as outfile: - # outfile.write(model_path) - # subprocess.run(['gsutil', 'cp', 'temp.txt', model_id]) if __name__ == '__main__': - # upload_model('aju-vtests2', display_name='sdk_test1', metadata_schema_uri="", - # image_uri='us-docker.pkg.dev/cloud-aiplatform/prediction/tf2-cpu.2-3:latest', - # artifact_uri='gs://aju-pipelines/v64/077ae97e-9c6d-4c1c-b5a1-fc2e95fb7dbb/0/bwmodel/trained_model/export/bikesw/1615937808') import kfp kfp.components.func_to_container_op(upload_model, output_component_file='../model_upload_component.yaml', - base_image='gcr.io/aju-vtests2/bw-aiplatform:v1') + base_image='gcr.io/google-samples/bw-aiplatform:v1') # gcloud beta ai models upload --region=us-central1 --display-name=bw2 --container-image-uri=us-docker.pkg.dev/cloud-aiplatform/prediction/tf2-cpu.2-3:latest --artifact-uri=gs://aju-pipelines/ktune13/f8515c75-32b7-47a4-af70-5ff24362eccc/0/bwmodel/trained_model/export/bikesw/1603733739 \ No newline at end of file diff --git a/ml/kubeflow-pipelines/keras_tuner/components/ucaip/training/create_training_job.py b/ml/kubeflow-pipelines/keras_tuner/components/ucaip/training/create_training_job.py index 46756cd..0f17623 100644 --- a/ml/kubeflow-pipelines/keras_tuner/components/ucaip/training/create_training_job.py +++ b/ml/kubeflow-pipelines/keras_tuner/components/ucaip/training/create_training_job.py @@ -32,8 +32,6 @@ def create_training_pipeline_custom_job( api_endpoint: str, # "us-central1-aiplatform.googleapis.com", data_dir: str, hptune_dict: str, - # model_id: OutputPath('String'), - # model_dispname: OutputPath('String') ) -> NamedTuple('Outputs', [('model_id', str), ('model_dispname', str)]): import logging @@ -102,8 +100,6 @@ def create_training_pipeline_custom_job( training_task_inputs = json_format.ParseDict(training_task_inputs_dict, Value()) training_task_definition = "gs://google-cloud-aiplatform/schema/trainingjob/definition/custom_task_1.0.0.yaml" 
- # image_uri = "gcr.io/cloud-aiplatform/prediction/tf-cpu.1-15:latest" - # image_uri = 'us-docker.pkg.dev/cloud-aiplatform/prediction/tf2-cpu.2-3:latest' training_pipeline = { "display_name": display_name, @@ -136,31 +132,11 @@ def create_training_pipeline_custom_job( logging.info('training finished') model_name = mresponse.model_to_upload.name return (model_name, model_display_name) - # # write some outputs once finished - # model_name = mresponse.model_to_upload.name - # logging.info('got model name: %s', model_name) - # with open('temp.txt', "w") as outfile: - # outfile.write(model_name) - # subprocess.run(['gsutil', 'cp', 'temp.txt', model_id]) - # with open('temp2.txt', "w") as outfile: - # outfile.write(model_display_name) - # subprocess.run(['gsutil', 'cp', 'temp2.txt', model_dispname]) - # break else: time.sleep(SLEEP_INTERVAL) if __name__ == '__main__': - # create_training_pipeline_custom_job( - # 'aju-vtests2', 'bw_sdktest2', - # 'bw_sdktest2', - # 'us-docker.pkg.dev/cloud-aiplatform/training/tf-gpu.2-3:latest', - # 'gs://aju-pipelines/ucaip/training1/bw-trainer-0.1.tar.gz', - # 'trainer.task', - # 'gs://aju-pipelines/ucaip/test1803_sdk1', - # "us-central1", - # "us-central1-aiplatform.googleapis.com", - # ) import kfp kfp.components.func_to_container_op(create_training_pipeline_custom_job, output_component_file='../model_train_component.yaml', - base_image='gcr.io/aju-vtests2/bw-aiplatform:v1') + base_image='gcr.io/google-samples/bw-aiplatform:v1') diff --git a/ml/kubeflow-pipelines/keras_tuner/components/ucaip/training/dist/bw-trainer-0.1.tar.gz b/ml/kubeflow-pipelines/keras_tuner/components/ucaip/training/dist/bw-trainer-0.1.tar.gz new file mode 100644 index 0000000000000000000000000000000000000000..c21c6dd249fa835021155a46e6a7dfded1a9ad9a GIT binary patch literal 4427 zcmV-R5wz|fiwFp~B3fVq|72-%bT49eEp&2WX>Mh5Eif)IE_7jX0PQ{fbJ|F<`B{I( zlv8&SSs@S@9Q%r^BR1J}#b3|H*|*P?(hbr8I_v8+l5H-P|NZrR2m}HgZyaykW~P#W 
zMm;^0n_?5^GI*!yq0yVl$GetWOo)n2Lgr*EVCdX{28B5A#8*^Td}|9hRz ztLE-*x3$-WYa67kxvcX``rmFf_g5V$`fn-wuk3%-{=*LJDf|E9 z+5d|VKi7}X-k(3a{&#nGH`@PpYgPYu+s*bX)O<<(|3U5lDao*hGhF{jQtHRi0kW;8 zc7{W8fEKc$&F_McXjgd{;&cW8{w<;CujroOY)VqZq5WuraFPVRi!&HvX}`oFsV8^> z7_ZMhoSpxAruFk|8mFt*brNDfSoEIwE{SMq*b$*_>L(eer0Bc|a5j$9aM7c*TKRwF z|G&lm@9lNl9i>MU~XVR{SKO0|{$SN>o5f7QO#|F1LT&+7l-dd2^@ zZT0)_%QU6x%Kt0>ul&FA|H}Vw@c&Eav+r^Khx7i5|F=7xJ>~yjX8-x2x;Q;6%~$^a z<^6xVUG@LA^8YX5|M%AAf4yyW_LV94A^pF2()#TB|FrzC-R^Ev<$tO3pR)fic>V*m zYj$=!O?CcL_P_MQ+VklD_UFH=_P@K=R_8xu|CRlJN&9bi_V?d*+UohAvj6i;i)YjS z&Gz4JfxTa~|82Xa?Eg!&*XS@#W~o1!W~kL{HPO#;JP8Ooj$8}%g8*H!TNGUqO42*x zS=wvuHIkXR9?Bz+<^4_x>L=KR4|&H#GGiFEtR^xTz>d7NQ~MiWn8i5?@eD( zg-_AghndNPOOg!v5pv@&349#61l{}Dlusj(02$~vi6S0lAixL%d;(YFG91DgCz!Qq zmL>F{(YU|Aw=k#HiqlCW5RkNSa(p;AyBgGiVm{_N3J9erCI8O-6nGuYioE6#kQv~6 z6sHJJQUdp~m=S!R`k5b1Ohn^xc8^m6SUjI*sXxlIrPV6P1E$LWuwFPqJN+wke6@q# z^{mmBJe|gzI8y*j?(D@}gJU=@+9v+{c!PR@zKl=@RI6gZv5%GC4 zHhD-=Mn4egvvuTqhpq@&B48W~f@nfqf9$)!S2W4-grG@$M^c{6oscy2Dcem7D{!-l%)Yg-@^&x2v#M*X@?009f|3NI z6MRdjA&zilA!LYUz!~gr67xj};}o_d8pjBa;ymLFVV8*@j%I8XMsa-0He=@qXhTfFJe1%OCc@7n;?f9O_lJ41OMokaToOtZq6LM=ZGnSR$bo7&kmYXhtfIL6T zJyKl8(L`LcndUe7MR98_Hj=x6?-3lCLF_ti8dGW}shgM~elSBnGBXk;rbq4*y=GAm z!OYN2y{4V?-wjSAufXb@Q(Ze6yzgJ14A}zQ+`vKflZkHh2%dm< zZv6P9X&xD)AjY1<#FLWDNN@(y7@25ua4e6NVh7tdXRZJ+#dLzSlx$xg6EQ{Y!NL#&7zqo%0mYX6qpGx9xi;) z-r7CuUk}fR*JlHU-XC)wP_Ku!uyMeJtx5`593aF1jhN=+aX^d(?QiU6jgNZ0W!zf+ zz|@ZkpfN^(lcYitA7$kAHcYl;wb%?ME6iK7J=w%t29j+Q14@!2?+Xc;5N3jA5FZ$@ zw}Omsvvt4Uu9}%}uvVDIoCjS-`hW^yiU*InVeT2wlvBcnOO1rPGTWKb5 zlNi<-2eQjyA)|8_L$sC$IAytu6s9R7GFM$1Z!1H3x^1@1wz+F|%&xhom*}l65a`Q- ztD~n7ImLA9jB@uDEZP!Dt5{7`0u?&`w%II`QH=tghmZ>1$4Qy1(sR+UK)7b|mfe)o zOZ}NUtlvHR9K)LF>|HjU6j(n>ayTMwunD&$#nci8&pKwH2D{mlKyX-?;)L{!nu)w@ z2H$iD3vt$N>DvqiH$#AXMvY}MtULj0ON`H;y!uuG7{I34ySsYrORb9Zc^G5|o@lb2 zKVbz*914d;!V#P*Fbh@qsfC*gv4a!<#O=jS=)A-E{6)7|L=OJ5Fv4rafiTghab zsPM`aI!*SaM63^XZLuGGhvdR>#DW8GpY_=wY$h)b~6R$#_9P%C0=}Gin1;K*^4&PRe2q9xO7tdKJ+;Vu#cB 
zG6*!*&549waPkGsdQB2v<3TmU!_FoZGCqk|sX{UOJF-QvB(Hyh(SDQ}6$}mI;Z3Vn z=Z;}Bq3m2g^OFxRU=^E5-%p=W+aeKqaFgX#Q z3Uq83H#2ae2UHvRx942uDa^};u~A&M%Y#^iAI!yBAm!*2I0JxRNXx+UPHJ7AUL`4< zqLyW~n1MY7Nqjg71mN6ydPoVZ`h0Y6{;e9Xgaj<9OM5qRkRD?8F(*OZw=Uy2JB29h z>fbp|i8n~oI5ldYN)fuq0#wh#f3&9F|EAvmw&wk>u>Ngj4Su}e|3o>sJDxnZ{%_6y z*j@eouhVQd)$e~V(Qe3O;;^4RK4}@Az!{gt_&vlg*vSRHH#KDml>b-xKY#lCpKf!% z)3IAB|K~^d|IhsPlt=tO%RFAq|KGD!{^uW&|6{BCAC>>}?fE}E?S#}Ndll>guAr^AM5$3Lu((0)k)oexA@qi6PE7fPb^dnBp8i#@#?-ai zAMI+@^B?v6XHEU*UT1$-MS(wF{8zk0@7eYLsr8@i{HN+a)%jo9|Cc=fzuj-Qy6XI| z?EfEL|JhOXKg#|q`~R}`pT++jTiJhQ|DW|G+K<$KvdZIC`)}{5_rI#=zv}t#f9dnz zqRbJe6P`p|TnF)F0vLZ*M}v=plk*Gb z^!#Xmdgwh4fPk}i*M}blyt1%^UURnR$=*~uyFPUW|Jy&kIN{HkSuMdE_78dkN&(=Q z(mW*Wd*}xN6KnE-VNnUyw5#D|e>nL0H<9|e*iGxyXK9wiyUJL$=VWHoG+r3yyz37S ze{rsk|BsQ^-GxaC9IRr+jNTvsi`nqbqFt4&& zm(~sQ7wcHA%3|Pht1OlqV(2mP&d1J!*JkeGz+WdNb z_Hs`y#vKmw#m@!&H-T!Fd5iNJl_V3ivQ5d zsN@5Siq1T8ik~kGS)NFe*qy@O!q%AcT0qD1(*p$ZZ%Qbdk%T&6J{%5T0-q*X9uXMe zW_$$ilyW*odjD9c?}>ACe5rFM0p>NP7K!fsG>#4=hd>ob7Pq@ z{)yyhRtbsjaO!hIKE6xfcs3o!?bE!I?*Q!c3;sN)HI1zD_FK*kAs|E^Ko zWdA2iO`8{RvUO*5)DuP`{#Rc2$Q}F*V_Ih=CIKGN2LCbym?(lHgU~(k%{Jt+QAePyyh> zgw(FWie3(|BCxBFE4TTO*$PQ$27p|4A*8vu9x~Nr();<#{OvR16ygbAH!nHiB6%ODw}6VVvow5R8*Wyt zM|G2a6=52Z(@3`fAspv_ftK^mft*E+Hq5o^#hGM*JzoZ5;e)Vd6`|^(y3HMvivt&p zFcpzsQBKVea5<<-<5HZ`XjF(}lt8p|uwnR0&wA;FRS2(oOxoung!qPNjG=+UhkKjE z;18Vy^U@}e|3&ADy|&yWo2x-tK36tyXMq7Rw9EqRn5&bqVQ#`-b}+*3ZMkHr=#!OB zhx38P3Xb)q(~mWZ?+EZD-q#XG!K{=$@ze$uY7|rUx(z0Yqes{g#_nB=Q;&I3W{rBx z;=p&hZe^cNlPL6b#&BHhwVJ#-xUxO-BoHCn&~|Fv@j*P0&VX{!KlU?YNw+pp1A^>( zq9mK=qLg*$E5>l0?rqSZ)h66XPkd|wTCoznmFrEz=KIvUDG8=o-queWh6+P8wwP#1 zhrz!a{OgJ}2V?9wENjSdm=)F?CuFPa=*(d95T3X|N_4$mmk|aq#LAyp5R4$O#(9wSI<@)KmGLsvwk8((yi>6wI|rz3 zqMeelfu8*?bS;^q9rl0rHtjD(L3F(?LYg{@>iEG-Jl^1{So \"$program_path\"\npython3 -u \"$program_path\" \"$@\"\n", + "def deploy_model(\n project,\n endpoint_disp_name,\n model_name,\n deployed_model_display_name,\n location = \"us-central1\",\n api_endpoint = 
\"us-central1-aiplatform.googleapis.com\",\n timeout = 7200,\n ):\n\n import logging\n from google.cloud import aiplatform\n\n logging.getLogger().setLevel(logging.INFO)\n\n def create_endpoint(\n project,\n display_name,\n client,\n location = \"us-central1\",\n api_endpoint = \"us-central1-aiplatform.googleapis.com\",\n timeout = 300,\n ):\n\n endpoint = {\"display_name\": display_name}\n parent = f\"projects/{project}/locations/{location}\"\n response = client.create_endpoint(parent=parent, endpoint=endpoint)\n print(\"Long running operation:\", response.operation.name)\n create_endpoint_response = response.result(timeout=timeout)\n print(\"create_endpoint_response:\", create_endpoint_response)\n endpoint_name = create_endpoint_response.name\n logging.info('endpoint name: %s', endpoint_name)\n return endpoint_name\n\n # The AI Platform services require regional API endpoints.\n client_options = {\"api_endpoint\": api_endpoint}\n # Initialize client that will be used to create and send requests.\n # This client only needs to be created once, and can be reused for multiple requests.\n client = aiplatform.gapic.EndpointServiceClient(client_options=client_options)\n\n # create endpoint\n logging.info('creating endpoint %s', endpoint_disp_name)\n endpoint_path = create_endpoint(project, endpoint_disp_name, client)\n logging.info(\"using endpoint path ID %s\", endpoint_path)\n\n deployed_model = {\n # format: 'projects/{project}/locations/{location}/models/{model}'\n \"model\": model_name,\n \"display_name\": deployed_model_display_name,\n # `dedicated_resources` must be used for non-AutoML models\n \"dedicated_resources\": {\n \"min_replica_count\": 1,\n \"machine_spec\": {\n \"machine_type\": \"n1-standard-2\",\n # Accelerators can be used only if the model specifies a GPU image.\n # 'accelerator_type': aiplatform.gapic.AcceleratorType.NVIDIA_TESLA_K80,\n # 'accelerator_count': 1,\n },\n },\n }\n # key '0' assigns traffic for the newly deployed model\n # Traffic 
percentage values must add up to 100\n # Leave dictionary empty if endpoint should not accept any traffic\n traffic_split = {\"0\": 100}\n# endpoint = client.endpoint_path(\n# project=project, location=location, endpoint=endpoint_id\n# )\n response = client.deploy_model(\n endpoint=endpoint_path, deployed_model=deployed_model, traffic_split=traffic_split\n )\n logging.info(\"Long running operation: %s\", response.operation.name)\n deploy_model_response = response.result(timeout=timeout)\n logging.info(\"deploy_model_response: %s\", deploy_model_response)\n\nimport argparse\n_parser = argparse.ArgumentParser(prog='Deploy model', description='')\n_parser.add_argument(\"--project\", dest=\"project\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--endpoint-disp-name\", dest=\"endpoint_disp_name\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--model-name\", dest=\"model_name\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--deployed-model-display-name\", dest=\"deployed_model_display_name\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--location\", dest=\"location\", type=str, required=False, default=argparse.SUPPRESS)\n_parser.add_argument(\"--api-endpoint\", dest=\"api_endpoint\", type=str, required=False, default=argparse.SUPPRESS)\n_parser.add_argument(\"--timeout\", dest=\"timeout\", type=int, required=False, default=argparse.SUPPRESS)\n_parsed_args = vars(_parser.parse_args())\n\n_outputs = deploy_model(**_parsed_args)\n" + ] + } + }, + "Create training pipeline custom job": { + "container": { + "command": [ + "sh", + "-ec", + "program_path=$(mktemp)\nprintf \"%s\" \"$0\" > \"$program_path\"\npython3 -u \"$program_path\" \"$@\"\n", + "def create_training_pipeline_custom_job(\n project,\n display_name,\n model_display_name,\n train_container_type,\n executor_image_uri,\n package_uri,\n python_module,\n container_image_uri,\n 
base_output_directory_prefix,\n prediction_image_uri, # 'us-docker.pkg.dev/cloud-aiplatform/prediction/tf2-cpu.2-3:latest'\n location, # \"us-central1\"\n api_endpoint, # \"us-central1-aiplatform.googleapis.com\",\n data_dir,\n hptune_dict,\n):\n\n import logging\n import subprocess\n import time\n\n from google.cloud import aiplatform\n from google.protobuf import json_format\n from google.protobuf.struct_pb2 import Value\n from google.cloud.aiplatform_v1beta1.types import pipeline_state\n\n logging.getLogger().setLevel(logging.INFO)\n\n # The AI Platform services require regional API endpoints.\n client_options = {\"api_endpoint\": api_endpoint}\n # Initialize client that will be used to create and send requests.\n # This client only needs to be created once, and can be reused for multiple requests.\n client = aiplatform.gapic.PipelineServiceClient(client_options=client_options)\n\n if train_container_type == 'prebuilt':\n python_package_spec = {\n \"executor_image_uri\": executor_image_uri,\n \"package_uris\": [package_uri],\n \"python_module\": python_module,\n \"args\": [f\"--data-dir={data_dir}\",\n f\"--hptune-dict={hptune_dict}\"]}\n worker_pool_spec = {\n \"machine_spec\": {\n \"machine_type\": \"n1-standard-16\",\n \"accelerator_type\": aiplatform.gapic.AcceleratorType.NVIDIA_TESLA_K80,\n \"accelerator_count\": 2,\n },\n \"replica_count\": 1,\n \"python_package_spec\": python_package_spec,\n }\n elif train_container_type == 'custom':\n container_spec = {\n # A working docker image can be found at gs://cloud-samples-data/ai-platform/mnist_tfrecord/custom_job\n \"imageUri\": container_image_uri,\n \"args\": [\n # AIP_MODEL_DIR is set by the service according to baseOutputDirectory.\n \"--model_dir=$(AIP_MODEL_DIR)\",\n ]}\n worker_pool_spec = {\n \"machine_spec\": {\n \"machine_type\": \"n1-standard-16\",\n \"accelerator_type\": aiplatform.gapic.AcceleratorType.NVIDIA_TESLA_K80,\n \"accelerator_count\": 2,\n },\n \"replica_count\": 1,\n \"container_spec\": 
container_spec,\n }\n else:\n logging.warning('unknown train_container_type; exiting')\n exit(1)\n\n training_task_inputs_dict = {\n \"workerPoolSpecs\": [\n worker_pool_spec\n ],\n \"baseOutputDirectory\": {\n # The GCS location for outputs must be accessible by the project's AI Platform service account.\n \"output_uri_prefix\": base_output_directory_prefix\n },\n }\n training_task_inputs = json_format.ParseDict(training_task_inputs_dict, Value())\n\n training_task_definition = \"gs://google-cloud-aiplatform/schema/trainingjob/definition/custom_task_1.0.0.yaml\"\n\n training_pipeline = {\n \"display_name\": display_name,\n \"training_task_definition\": training_task_definition,\n \"training_task_inputs\": training_task_inputs,\n \"model_to_upload\": {\n \"display_name\": model_display_name,\n \"container_spec\": {\"image_uri\": prediction_image_uri},\n },\n }\n parent = f\"projects/{project}/locations/{location}\"\n response = client.create_training_pipeline(\n parent=parent, training_pipeline=training_pipeline\n )\n logging.info(\"training pipeline request response: %s\", response)\n\n SLEEP_INTERVAL = 100\n\n training_pipeline_name = response.name\n logging.info(\"training pipeline name: %s\", training_pipeline_name)\n # Poll periodically until training completes\n while True:\n mresponse = client.get_training_pipeline(name=training_pipeline_name)\n logging.info('mresponse: %s', mresponse)\n logging.info('job state: %s', mresponse.state)\n if mresponse.state == pipeline_state.PipelineState.PIPELINE_STATE_FAILED:\n logging.warning('training pipeline failed: %s', mresponse)\n exit(1)\n if mresponse.state == pipeline_state.PipelineState.PIPELINE_STATE_SUCCEEDED:\n logging.info('training finished')\n model_name = mresponse.model_to_upload.name\n return (model_name, model_display_name)\n else:\n time.sleep(SLEEP_INTERVAL)\n\ndef _serialize_str(str_value: str) -> str:\n if not isinstance(str_value, str):\n raise TypeError('Value \"{}\" has type \"{}\" instead of 
str.'.format(str(str_value), str(type(str_value))))\n return str_value\n\nimport argparse\n_parser = argparse.ArgumentParser(prog='Create training pipeline custom job', description='')\n_parser.add_argument(\"--project\", dest=\"project\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--display-name\", dest=\"display_name\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--model-display-name\", dest=\"model_display_name\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--train-container-type\", dest=\"train_container_type\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--executor-image-uri\", dest=\"executor_image_uri\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--package-uri\", dest=\"package_uri\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--python-module\", dest=\"python_module\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--container-image-uri\", dest=\"container_image_uri\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--base-output-directory-prefix\", dest=\"base_output_directory_prefix\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--prediction-image-uri\", dest=\"prediction_image_uri\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--location\", dest=\"location\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--api-endpoint\", dest=\"api_endpoint\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--data-dir\", dest=\"data_dir\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--hptune-dict\", dest=\"hptune_dict\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"----output-paths\", dest=\"_output_paths\", type=str, nargs=2)\n_parsed_args = 
vars(_parser.parse_args())\n_output_files = _parsed_args.pop(\"_output_paths\", [])\n\n_outputs = create_training_pipeline_custom_job(**_parsed_args)\n\n_output_serializers = [\n _serialize_str,\n _serialize_str,\n\n]\n\nimport os\nfor idx, output_file in enumerate(_output_files):\n try:\n os.makedirs(os.path.dirname(output_file))\n except OSError:\n pass\n with open(output_file, 'w') as f:\n f.write(_output_serializers[idx](_outputs[idx]))\n" + ], + "args": [ + "--project", + "{{$.inputs.parameters['project']}}", + "--display-name", + "{{$.inputs.parameters['display_name']}}", + "--model-display-name", + "{{$.inputs.parameters['model_display_name']}}", + "--train-container-type", + "{{$.inputs.parameters['train_container_type']}}", + "--executor-image-uri", + "{{$.inputs.parameters['executor_image_uri']}}", + "--package-uri", + "{{$.inputs.parameters['package_uri']}}", + "--python-module", + "{{$.inputs.parameters['python_module']}}", + "--container-image-uri", + "{{$.inputs.parameters['container_image_uri']}}", + "--base-output-directory-prefix", + "{{$.inputs.parameters['base_output_directory_prefix']}}", + "--prediction-image-uri", + "{{$.inputs.parameters['prediction_image_uri']}}", + "--location", + "{{$.inputs.parameters['location']}}", + "--api-endpoint", + "{{$.inputs.parameters['api_endpoint']}}", + "--data-dir", + "{{$.inputs.parameters['data_dir']}}", + "--hptune-dict", + "{{$.inputs.parameters['hptune_dict']}}", + "----output-paths", + "{{$.outputs.parameters['model_id'].output_file}}", + "{{$.outputs.parameters['model_dispname'].output_file}}" + ], + "image": "gcr.io/google-samples/bw-aiplatform:v1" + } + } + } + }, "runtimeParameters": { - "train_container_type": { + "executor_image_uri": { "defaultValue": { - "stringValue": "prebuilt" + "stringValue": "us-docker.pkg.dev/cloud-aiplatform/training/tf-gpu.2-3:latest" }, "type": "STRING" }, - "hptune_dict": { - "type": "STRING", + "model_display_name": { "defaultValue": { - "stringValue": 
"{\"num_hidden_layers\": 3, \"hidden_size\": 32, \"learning_rate\": 0.01, \"epochs\": 3, \"steps_per_epoch\": -1}" - } + "stringValue": "CHANGE THIS" + }, + "type": "STRING" }, "timeout": { "type": "INT", @@ -19,86 +96,88 @@ "intValue": "7200" } }, - "project": { - "type": "STRING", - "defaultValue": { - "stringValue": "aju-vtests2" - } - }, - "python_module": { + "training_display_name": { "type": "STRING", "defaultValue": { - "stringValue": "trainer.task" + "stringValue": "CHANGE THIS" } }, - "package_uri": { + "project": { "defaultValue": { - "stringValue": "gs://aju-pipelines/ucaip/training1/bw-trainer-0.1.tar.gz" + "stringValue": "aju-vtests2" }, "type": "STRING" }, - "container_image_uri": { + "endpoint_disp_name": { + "defaultValue": { + "stringValue": "CHANGE THIS" + }, "type": "STRING" }, - "prediction_image_uri": { + "python_module": { "defaultValue": { - "stringValue": "us-docker.pkg.dev/cloud-aiplatform/prediction/tf2-cpu.2-3:latest" + "stringValue": "trainer.task" }, "type": "STRING" }, - "executor_image_uri": { + "api_endpoint": { "type": "STRING", "defaultValue": { - "stringValue": "us-docker.pkg.dev/cloud-aiplatform/training/tf-gpu.2-3:latest" + "stringValue": "us-central1-aiplatform.googleapis.com" } }, - "endpoint_disp_name": { + "package_uri": { "defaultValue": { - "stringValue": "CHANGE THIS" + "stringValue": "gs://aju-pipelines/ucaip/training1/bw-trainer-0.1.tar.gz" }, "type": "STRING" }, - "base_output_directory_prefix": { + "hptune_dict": { + "type": "STRING", "defaultValue": { - "stringValue": "gs://aju-pipelines/ucaip/training2/" + "stringValue": "{\"num_hidden_layers\": 3, \"hidden_size\": 32, \"learning_rate\": 0.01, \"epochs\": 3, \"steps_per_epoch\": -1}" + } + }, + "location": { + "defaultValue": { + "stringValue": "us-central1" }, "type": "STRING" }, - "model_display_name": { + "base_output_directory_prefix": { "type": "STRING", "defaultValue": { - "stringValue": "CHANGE THIS" + "stringValue": "gs://aju-pipelines/ucaip/training2/" } 
}, - "api_endpoint": { - "type": "STRING", - "defaultValue": { - "stringValue": "us-central1-aiplatform.googleapis.com" - } + "container_image_uri": { + "type": "STRING" }, "data_dir": { - "type": "STRING", "defaultValue": { "stringValue": "gs://aju-dev-demos-codelabs/bikes_weather/" - } + }, + "type": "STRING" }, - "location": { + "prediction_image_uri": { "type": "STRING", "defaultValue": { - "stringValue": "us-central1" + "stringValue": "us-docker.pkg.dev/cloud-aiplatform/prediction/tf2-cpu.2-3:latest" } }, - "training_display_name": { + "train_container_type": { + "type": "STRING", "defaultValue": { - "stringValue": "CHANGE THIS" - }, - "type": "STRING" + "stringValue": "prebuilt" + } } }, - "sdkVersion": "kfp-1.4.0", "schemaVersion": "v2alpha1", "tasks": [ { + "taskInfo": { + "name": "Create training pipeline custom job" + }, "outputs": { "parameters": { "model_id": { @@ -109,24 +188,31 @@ } } }, - "taskInfo": { - "name": "Create training pipeline custom job" - }, "inputs": { "parameters": { - "project": { + "base_output_directory_prefix": { "runtimeValue": { - "runtimeParameter": "project" + "runtimeParameter": "base_output_directory_prefix" } }, - "base_output_directory_prefix": { + "hptune_dict": { "runtimeValue": { - "runtimeParameter": "base_output_directory_prefix" + "runtimeParameter": "hptune_dict" } }, - "python_module": { + "prediction_image_uri": { "runtimeValue": { - "runtimeParameter": "python_module" + "runtimeParameter": "prediction_image_uri" + } + }, + "executor_image_uri": { + "runtimeValue": { + "runtimeParameter": "executor_image_uri" + } + }, + "train_container_type": { + "runtimeValue": { + "runtimeParameter": "train_container_type" } }, "container_image_uri": { @@ -134,9 +220,9 @@ "runtimeParameter": "container_image_uri" } }, - "display_name": { + "package_uri": { "runtimeValue": { - "runtimeParameter": "training_display_name" + "runtimeParameter": "package_uri" } }, "location": { @@ -144,24 +230,19 @@ "runtimeParameter": "location" } 
}, - "api_endpoint": { - "runtimeValue": { - "runtimeParameter": "api_endpoint" - } - }, - "hptune_dict": { + "python_module": { "runtimeValue": { - "runtimeParameter": "hptune_dict" + "runtimeParameter": "python_module" } }, - "train_container_type": { + "api_endpoint": { "runtimeValue": { - "runtimeParameter": "train_container_type" + "runtimeParameter": "api_endpoint" } }, - "executor_image_uri": { + "project": { "runtimeValue": { - "runtimeParameter": "executor_image_uri" + "runtimeParameter": "project" } }, "model_display_name": { @@ -169,37 +250,41 @@ "runtimeParameter": "model_display_name" } }, - "prediction_image_uri": { + "display_name": { "runtimeValue": { - "runtimeParameter": "prediction_image_uri" + "runtimeParameter": "training_display_name" } }, "data_dir": { "runtimeValue": { "runtimeParameter": "data_dir" } - }, - "package_uri": { - "runtimeValue": { - "runtimeParameter": "package_uri" - } } } }, "executorLabel": "Create training pipeline custom job" }, { + "taskInfo": { + "name": "Deploy model" + }, "executorLabel": "Deploy model", "inputs": { "parameters": { + "model_name": { + "taskOutputParameter": { + "producerTask": "Create training pipeline custom job", + "outputParameterKey": "model_id" + } + }, "endpoint_disp_name": { "runtimeValue": { "runtimeParameter": "endpoint_disp_name" } }, - "deployed_model_display_name": { + "api_endpoint": { "runtimeValue": { - "runtimeParameter": "model_display_name" + "runtimeParameter": "api_endpoint" } }, "timeout": { @@ -207,110 +292,25 @@ "runtimeParameter": "timeout" } }, - "api_endpoint": { + "location": { "runtimeValue": { - "runtimeParameter": "api_endpoint" + "runtimeParameter": "location" } }, - "project": { + "deployed_model_display_name": { "runtimeValue": { - "runtimeParameter": "project" + "runtimeParameter": "model_display_name" } }, - "location": { + "project": { "runtimeValue": { - "runtimeParameter": "location" - } - }, - "model_name": { - "taskOutputParameter": { - "outputParameterKey": 
"model_id", - "producerTask": "Create training pipeline custom job" + "runtimeParameter": "project" } } } - }, - "taskInfo": { - "name": "Deploy model" } } ], - "deploymentConfig": { - "executors": { - "Create training pipeline custom job": { - "container": { - "command": [ - "sh", - "-ec", - "program_path=$(mktemp)\nprintf \"%s\" \"$0\" > \"$program_path\"\npython3 -u \"$program_path\" \"$@\"\n", - "def create_training_pipeline_custom_job(\n project,\n display_name,\n model_display_name,\n train_container_type,\n executor_image_uri,\n package_uri,\n python_module,\n container_image_uri,\n base_output_directory_prefix,\n prediction_image_uri, # 'us-docker.pkg.dev/cloud-aiplatform/prediction/tf2-cpu.2-3:latest'\n location, # \"us-central1\"\n api_endpoint, # \"us-central1-aiplatform.googleapis.com\",\n data_dir,\n hptune_dict,\n # model_id: OutputPath('String'),\n # model_dispname: OutputPath('String')\n):\n\n import logging\n import subprocess\n import time\n\n from google.cloud import aiplatform\n from google.protobuf import json_format\n from google.protobuf.struct_pb2 import Value\n from google.cloud.aiplatform_v1beta1.types import pipeline_state\n\n logging.getLogger().setLevel(logging.INFO)\n\n # The AI Platform services require regional API endpoints.\n client_options = {\"api_endpoint\": api_endpoint}\n # Initialize client that will be used to create and send requests.\n # This client only needs to be created once, and can be reused for multiple requests.\n client = aiplatform.gapic.PipelineServiceClient(client_options=client_options)\n\n if train_container_type == 'prebuilt':\n python_package_spec = {\n \"executor_image_uri\": executor_image_uri,\n \"package_uris\": [package_uri],\n \"python_module\": python_module,\n \"args\": [f\"--data-dir={data_dir}\",\n f\"--hptune-dict={hptune_dict}\"]}\n worker_pool_spec = {\n \"machine_spec\": {\n \"machine_type\": \"n1-standard-16\",\n \"accelerator_type\": aiplatform.gapic.AcceleratorType.NVIDIA_TESLA_K80,\n 
\"accelerator_count\": 2,\n },\n \"replica_count\": 1,\n \"python_package_spec\": python_package_spec,\n }\n elif train_container_type == 'custom':\n container_spec = {\n # A working docker image can be found at gs://cloud-samples-data/ai-platform/mnist_tfrecord/custom_job\n \"imageUri\": container_image_uri,\n \"args\": [\n # AIP_MODEL_DIR is set by the service according to baseOutputDirectory.\n \"--model_dir=$(AIP_MODEL_DIR)\",\n ]}\n worker_pool_spec = {\n \"machine_spec\": {\n \"machine_type\": \"n1-standard-16\",\n \"accelerator_type\": aiplatform.gapic.AcceleratorType.NVIDIA_TESLA_K80,\n \"accelerator_count\": 2,\n },\n \"replica_count\": 1,\n \"container_spec\": container_spec,\n }\n\n training_task_inputs_dict = {\n \"workerPoolSpecs\": [\n worker_pool_spec\n ],\n \"baseOutputDirectory\": {\n # The GCS location for outputs must be accessible by the project's AI Platform service account.\n \"output_uri_prefix\": base_output_directory_prefix\n },\n }\n training_task_inputs = json_format.ParseDict(training_task_inputs_dict, Value())\n\n training_task_definition = \"gs://google-cloud-aiplatform/schema/trainingjob/definition/custom_task_1.0.0.yaml\"\n # image_uri = \"gcr.io/cloud-aiplatform/prediction/tf-cpu.1-15:latest\"\n # image_uri = 'us-docker.pkg.dev/cloud-aiplatform/prediction/tf2-cpu.2-3:latest'\n\n training_pipeline = {\n \"display_name\": display_name,\n \"training_task_definition\": training_task_definition,\n \"training_task_inputs\": training_task_inputs,\n \"model_to_upload\": {\n \"display_name\": model_display_name,\n \"container_spec\": {\"image_uri\": prediction_image_uri},\n },\n }\n parent = f\"projects/{project}/locations/{location}\"\n response = client.create_training_pipeline(\n parent=parent, training_pipeline=training_pipeline\n )\n logging.info(\"training pipeline request response: %s\", response)\n\n SLEEP_INTERVAL = 100\n\n training_pipeline_name = response.name\n logging.info(\"training pipeline name: %s\", 
training_pipeline_name)\n # Poll periodically until training completes\n while True:\n mresponse = client.get_training_pipeline(name=training_pipeline_name)\n logging.info('mresponse: %s', mresponse)\n logging.info('job state: %s', mresponse.state)\n if mresponse.state == pipeline_state.PipelineState.PIPELINE_STATE_FAILED:\n logging.warning('training pipeline failed: %s', mresponse)\n break\n if mresponse.state == pipeline_state.PipelineState.PIPELINE_STATE_SUCCEEDED:\n logging.info('training finished')\n model_name = mresponse.model_to_upload.name\n return (model_name, model_display_name)\n # # write some outputs once finished\n # model_name = mresponse.model_to_upload.name\n # logging.info('got model name: %s', model_name)\n # with open('temp.txt', \"w\") as outfile:\n # outfile.write(model_name)\n # subprocess.run(['gsutil', 'cp', 'temp.txt', model_id])\n # with open('temp2.txt', \"w\") as outfile:\n # outfile.write(model_display_name)\n # subprocess.run(['gsutil', 'cp', 'temp2.txt', model_dispname])\n # break\n else:\n time.sleep(SLEEP_INTERVAL)\n\ndef _serialize_str(str_value: str) -> str:\n if not isinstance(str_value, str):\n raise TypeError('Value \"{}\" has type \"{}\" instead of str.'.format(str(str_value), str(type(str_value))))\n return str_value\n\nimport argparse\n_parser = argparse.ArgumentParser(prog='Create training pipeline custom job', description='')\n_parser.add_argument(\"--project\", dest=\"project\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--display-name\", dest=\"display_name\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--model-display-name\", dest=\"model_display_name\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--train-container-type\", dest=\"train_container_type\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--executor-image-uri\", dest=\"executor_image_uri\", type=str, required=True, 
default=argparse.SUPPRESS)\n_parser.add_argument(\"--package-uri\", dest=\"package_uri\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--python-module\", dest=\"python_module\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--container-image-uri\", dest=\"container_image_uri\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--base-output-directory-prefix\", dest=\"base_output_directory_prefix\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--prediction-image-uri\", dest=\"prediction_image_uri\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--location\", dest=\"location\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--api-endpoint\", dest=\"api_endpoint\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--data-dir\", dest=\"data_dir\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--hptune-dict\", dest=\"hptune_dict\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"----output-paths\", dest=\"_output_paths\", type=str, nargs=2)\n_parsed_args = vars(_parser.parse_args())\n_output_files = _parsed_args.pop(\"_output_paths\", [])\n\n_outputs = create_training_pipeline_custom_job(**_parsed_args)\n\n_output_serializers = [\n _serialize_str,\n _serialize_str,\n\n]\n\nimport os\nfor idx, output_file in enumerate(_output_files):\n try:\n os.makedirs(os.path.dirname(output_file))\n except OSError:\n pass\n with open(output_file, 'w') as f:\n f.write(_output_serializers[idx](_outputs[idx]))\n" - ], - "image": "gcr.io/aju-vtests2/bw-aiplatform:v1", - "args": [ - "--project", - "{{$.inputs.parameters['project']}}", - "--display-name", - "{{$.inputs.parameters['display_name']}}", - "--model-display-name", - "{{$.inputs.parameters['model_display_name']}}", - "--train-container-type", - 
"{{$.inputs.parameters['train_container_type']}}", - "--executor-image-uri", - "{{$.inputs.parameters['executor_image_uri']}}", - "--package-uri", - "{{$.inputs.parameters['package_uri']}}", - "--python-module", - "{{$.inputs.parameters['python_module']}}", - "--container-image-uri", - "{{$.inputs.parameters['container_image_uri']}}", - "--base-output-directory-prefix", - "{{$.inputs.parameters['base_output_directory_prefix']}}", - "--prediction-image-uri", - "{{$.inputs.parameters['prediction_image_uri']}}", - "--location", - "{{$.inputs.parameters['location']}}", - "--api-endpoint", - "{{$.inputs.parameters['api_endpoint']}}", - "--data-dir", - "{{$.inputs.parameters['data_dir']}}", - "--hptune-dict", - "{{$.inputs.parameters['hptune_dict']}}", - "----output-paths", - "{{$.outputs.parameters['model_id'].output_file}}", - "{{$.outputs.parameters['model_dispname'].output_file}}" - ] - } - }, - "Deploy model": { - "container": { - "args": [ - "--project", - "{{$.inputs.parameters['project']}}", - "--endpoint-disp-name", - "{{$.inputs.parameters['endpoint_disp_name']}}", - "--model-name", - "{{$.inputs.parameters['model_name']}}", - "--deployed-model-display-name", - "{{$.inputs.parameters['deployed_model_display_name']}}", - "--location", - "{{$.inputs.parameters['location']}}", - "--api-endpoint", - "{{$.inputs.parameters['api_endpoint']}}", - "--timeout", - "{{$.inputs.parameters['timeout']}}" - ], - "command": [ - "sh", - "-ec", - "program_path=$(mktemp)\nprintf \"%s\" \"$0\" > \"$program_path\"\npython3 -u \"$program_path\" \"$@\"\n", - "def deploy_model(\n project,\n endpoint_disp_name,\n model_name,\n deployed_model_display_name,\n location = \"us-central1\",\n api_endpoint = \"us-central1-aiplatform.googleapis.com\",\n timeout = 7200,\n ):\n\n import logging\n from google.cloud import aiplatform\n\n logging.getLogger().setLevel(logging.INFO)\n\n def create_endpoint(\n project,\n display_name,\n client,\n location = \"us-central1\",\n api_endpoint = 
\"us-central1-aiplatform.googleapis.com\",\n timeout = 300,\n ):\n\n endpoint = {\"display_name\": display_name}\n parent = f\"projects/{project}/locations/{location}\"\n response = client.create_endpoint(parent=parent, endpoint=endpoint)\n print(\"Long running operation:\", response.operation.name)\n create_endpoint_response = response.result(timeout=timeout)\n print(\"create_endpoint_response:\", create_endpoint_response)\n endpoint_name = create_endpoint_response.name\n logging.info('endpoint name: %s', endpoint_name)\n return endpoint_name\n\n # The AI Platform services require regional API endpoints.\n client_options = {\"api_endpoint\": api_endpoint}\n # Initialize client that will be used to create and send requests.\n # This client only needs to be created once, and can be reused for multiple requests.\n client = aiplatform.gapic.EndpointServiceClient(client_options=client_options)\n\n # create endpoint\n logging.info('creating endpoint %s', endpoint_disp_name)\n endpoint_path = create_endpoint(project, endpoint_disp_name, client)\n logging.info(\"using endpoint path ID %s\", endpoint_path)\n\n deployed_model = {\n # format: 'projects/{project}/locations/{location}/models/{model}'\n \"model\": model_name,\n \"display_name\": deployed_model_display_name,\n # `dedicated_resources` must be used for non-AutoML models\n \"dedicated_resources\": {\n \"min_replica_count\": 1,\n \"machine_spec\": {\n \"machine_type\": \"n1-standard-2\",\n # Accelerators can be used only if the model specifies a GPU image.\n # 'accelerator_type': aiplatform.gapic.AcceleratorType.NVIDIA_TESLA_K80,\n # 'accelerator_count': 1,\n },\n },\n }\n # key '0' assigns traffic for the newly deployed model\n # Traffic percentage values must add up to 100\n # Leave dictionary empty if endpoint should not accept any traffic\n traffic_split = {\"0\": 100}\n# endpoint = client.endpoint_path(\n# project=project, location=location, endpoint=endpoint_id\n# )\n response = client.deploy_model(\n # 
endpoint=endpoint, deployed_model=deployed_model, traffic_split=traffic_split\n endpoint=endpoint_path, deployed_model=deployed_model, traffic_split=traffic_split\n )\n logging.info(\"Long running operation: %s\", response.operation.name)\n deploy_model_response = response.result(timeout=timeout)\n logging.info(\"deploy_model_response: %s\", deploy_model_response)\n\nimport argparse\n_parser = argparse.ArgumentParser(prog='Deploy model', description='')\n_parser.add_argument(\"--project\", dest=\"project\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--endpoint-disp-name\", dest=\"endpoint_disp_name\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--model-name\", dest=\"model_name\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--deployed-model-display-name\", dest=\"deployed_model_display_name\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--location\", dest=\"location\", type=str, required=False, default=argparse.SUPPRESS)\n_parser.add_argument(\"--api-endpoint\", dest=\"api_endpoint\", type=str, required=False, default=argparse.SUPPRESS)\n_parser.add_argument(\"--timeout\", dest=\"timeout\", type=int, required=False, default=argparse.SUPPRESS)\n_parsed_args = vars(_parser.parse_args())\n\n_outputs = deploy_model(**_parsed_args)\n" - ], - "image": "gcr.io/aju-vtests2/bw-aiplatform:v1" - } - } - }, - "@type": "type.googleapis.com/ml_pipelines.PipelineDeploymentConfig" - }, "pipelineInfo": { "name": "ucaip-model-train" } From 016b998a5cec7b8772c0a6a180167d63696a1c07 Mon Sep 17 00:00:00 2001 From: Amy Unruh Date: Wed, 24 Mar 2021 14:45:00 -0700 Subject: [PATCH 4/4] minor cleanup --- .../keras_tuner/components/ucaip/bw_ucaip_train_pl.py | 9 ++++++--- .../components/ucaip/training/create_training_job.py | 9 +++++++-- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git 
a/ml/kubeflow-pipelines/keras_tuner/components/ucaip/bw_ucaip_train_pl.py b/ml/kubeflow-pipelines/keras_tuner/components/ucaip/bw_ucaip_train_pl.py index cfdf41c..5907fa3 100644 --- a/ml/kubeflow-pipelines/keras_tuner/components/ucaip/bw_ucaip_train_pl.py +++ b/ml/kubeflow-pipelines/keras_tuner/components/ucaip/bw_ucaip_train_pl.py @@ -34,16 +34,19 @@ def model_train_pipeline( prediction_image_uri: str = 'us-docker.pkg.dev/cloud-aiplatform/prediction/tf2-cpu.2-3:latest', location: str = "us-central1", api_endpoint: str = "us-central1-aiplatform.googleapis.com", - project: str = 'aju-vtests2', + # project: str = 'aju-vtests2', + project: str = 'CHANGE THIS', training_display_name: str = 'CHANGE THIS', model_display_name: str = 'CHANGE THIS', endpoint_disp_name: str = 'CHANGE THIS', train_container_type: str = 'prebuilt', executor_image_uri: str = 'us-docker.pkg.dev/cloud-aiplatform/training/tf-gpu.2-3:latest', - package_uri: str = 'gs://aju-pipelines/ucaip/training1/bw-trainer-0.1.tar.gz', + # package_uri: str = 'gs://aju-pipelines/ucaip/training1/bw-trainer-0.1.tar.gz', + package_uri: str = 'CHANGE THIS', # TODO: upload publicly readable version python_module: str = 'trainer.task', container_image_uri: str = '', - base_output_directory_prefix: str = 'gs://aju-pipelines/ucaip/training2/', + # base_output_directory_prefix: str = 'gs://aju-pipelines/ucaip/training2/', + base_output_directory_prefix: str = 'CHANGE THIS', timeout: int = 7200, hptune_dict: str = '{"num_hidden_layers": 3, "hidden_size": 32, "learning_rate": 0.01, "epochs": 3, "steps_per_epoch": -1}', data_dir: str = 'gs://aju-dev-demos-codelabs/bikes_weather/' diff --git a/ml/kubeflow-pipelines/keras_tuner/components/ucaip/training/create_training_job.py b/ml/kubeflow-pipelines/keras_tuner/components/ucaip/training/create_training_job.py index 0f17623..2c004a2 100644 --- a/ml/kubeflow-pipelines/keras_tuner/components/ucaip/training/create_training_job.py +++ 
b/ml/kubeflow-pipelines/keras_tuner/components/ucaip/training/create_training_job.py @@ -12,8 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. - -# from kfp.components import OutputPath from typing import NamedTuple @@ -51,7 +49,11 @@ def create_training_pipeline_custom_job( # This client only needs to be created once, and can be reused for multiple requests. client = aiplatform.gapic.PipelineServiceClient(client_options=client_options) + # TODO: more error checking before kicking off the job if train_container_type == 'prebuilt': + if package_uri == 'none' or executor_image_uri == 'none': + logging.warning('unspecified URI; exiting') + exit(1) python_package_spec = { "executor_image_uri": executor_image_uri, "package_uris": [package_uri], @@ -68,6 +70,9 @@ def create_training_pipeline_custom_job( "python_package_spec": python_package_spec, } elif train_container_type == 'custom': + if container_image_uri == 'none': + logging.warning('unspecified container_image_uri; exiting') + exit(1) container_spec = { # A working docker image can be found at gs://cloud-samples-data/ai-platform/mnist_tfrecord/custom_job "imageUri": container_image_uri,