diff --git a/ml/kubeflow-pipelines/keras_tuner/components/ucaip/bw_ucaip_train_pl.py b/ml/kubeflow-pipelines/keras_tuner/components/ucaip/bw_ucaip_train_pl.py new file mode 100644 index 0000000..5907fa3 --- /dev/null +++ b/ml/kubeflow-pipelines/keras_tuner/components/ucaip/bw_ucaip_train_pl.py @@ -0,0 +1,80 @@ +# Copyright 2021 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +from kfp.v2 import dsl +from kfp.v2 import compiler +from kfp import components + +model_train_op = components.load_component_from_file( + './model_train_component.yaml' + ) + +model_deploy_op = components.load_component_from_file( + './model_deploy_component.yaml' + ) + + +@dsl.pipeline( + name='ucaip-model-train', + description='ucaip model train' +) +def model_train_pipeline( + prediction_image_uri: str = 'us-docker.pkg.dev/cloud-aiplatform/prediction/tf2-cpu.2-3:latest', + location: str = "us-central1", + api_endpoint: str = "us-central1-aiplatform.googleapis.com", + # project: str = 'aju-vtests2', + project: str = 'CHANGE THIS', + training_display_name: str = 'CHANGE THIS', + model_display_name: str = 'CHANGE THIS', + endpoint_disp_name: str = 'CHANGE THIS', + train_container_type: str = 'prebuilt', + executor_image_uri: str = 'us-docker.pkg.dev/cloud-aiplatform/training/tf-gpu.2-3:latest', + # package_uri: str = 'gs://aju-pipelines/ucaip/training1/bw-trainer-0.1.tar.gz', + package_uri: str = 'CHANGE THIS', # TODO: upload publicly readable version + python_module: str = 'trainer.task', + container_image_uri: str = '', + # base_output_directory_prefix: str = 'gs://aju-pipelines/ucaip/training2/', + base_output_directory_prefix: str = 'CHANGE THIS', + timeout: int = 7200, + hptune_dict: str = '{"num_hidden_layers": 3, "hidden_size": 32, "learning_rate": 0.01, "epochs": 3, "steps_per_epoch": -1}', + data_dir: str = 'gs://aju-dev-demos-codelabs/bikes_weather/' + ): + + model_train = model_train_op( + project, training_display_name, model_display_name, + train_container_type, + executor_image_uri, package_uri, python_module, + container_image_uri, + base_output_directory_prefix, + prediction_image_uri, + location, api_endpoint, + data_dir, + hptune_dict + ) + + model_deploy = model_deploy_op( + project, endpoint_disp_name, + model_train.outputs['model_id'], + model_display_name, + location, api_endpoint, timeout + ) + + + +if __name__ == '__main__': + PIPELINE_ROOT = 'gs://aju-pipelines/pipeline_root/ucaiptests' + compiler.Compiler().compile(pipeline_func=model_train_pipeline, + pipeline_root=PIPELINE_ROOT, + output_path='ucaip_train_pipeline_spec.json') diff --git a/ml/kubeflow-pipelines/keras_tuner/components/ucaip/model_deploy_component.yaml b/ml/kubeflow-pipelines/keras_tuner/components/ucaip/model_deploy_component.yaml new file mode 100644 index 0000000..8761f8c --- /dev/null +++ b/ml/kubeflow-pipelines/keras_tuner/components/ucaip/model_deploy_component.yaml @@ -0,0 +1,131 @@ +name: Deploy model +inputs: +- {name: project, type: String} +- {name: endpoint_disp_name, 
type: String} +- {name: model_name, type: String} +- {name: deployed_model_display_name, type: String} +- {name: location, type: String, default: us-central1, optional: true} +- {name: api_endpoint, type: String, default: us-central1-aiplatform.googleapis.com, + optional: true} +- {name: timeout, type: Integer, default: '7200', optional: true} +implementation: + container: + image: gcr.io/google-samples/bw-aiplatform:v1 + command: + - sh + - -ec + - | + program_path=$(mktemp) + printf "%s" "$0" > "$program_path" + python3 -u "$program_path" "$@" + - | + def deploy_model( + project, + endpoint_disp_name, + model_name, + deployed_model_display_name, + location = "us-central1", + api_endpoint = "us-central1-aiplatform.googleapis.com", + timeout = 7200, + ): + + import logging + from google.cloud import aiplatform + + logging.getLogger().setLevel(logging.INFO) + + def create_endpoint( + project, + display_name, + client, + location = "us-central1", + api_endpoint = "us-central1-aiplatform.googleapis.com", + timeout = 300, + ): + + endpoint = {"display_name": display_name} + parent = f"projects/{project}/locations/{location}" + response = client.create_endpoint(parent=parent, endpoint=endpoint) + print("Long running operation:", response.operation.name) + create_endpoint_response = response.result(timeout=timeout) + print("create_endpoint_response:", create_endpoint_response) + endpoint_name = create_endpoint_response.name + logging.info('endpoint name: %s', endpoint_name) + return endpoint_name + + # The AI Platform services require regional API endpoints. + client_options = {"api_endpoint": api_endpoint} + # Initialize client that will be used to create and send requests. + # This client only needs to be created once, and can be reused for multiple requests. + client = aiplatform.gapic.EndpointServiceClient(client_options=client_options) + + # create endpoint + logging.info('creating endpoint %s', endpoint_disp_name) + endpoint_path = create_endpoint(project, endpoint_disp_name, client) + logging.info("using endpoint path ID %s", endpoint_path) + + deployed_model = { + # format: 'projects/{project}/locations/{location}/models/{model}' + "model": model_name, + "display_name": deployed_model_display_name, + # `dedicated_resources` must be used for non-AutoML models + "dedicated_resources": { + "min_replica_count": 1, + "machine_spec": { + "machine_type": "n1-standard-2", + # Accelerators can be used only if the model specifies a GPU image. 
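+                # (The pipeline's default prediction_image_uri is a CPU image, so
+                # these stay commented out; a GPU serving image would be needed
+                # before enabling the two lines below.)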
+ # 'accelerator_type': aiplatform.gapic.AcceleratorType.NVIDIA_TESLA_K80, + # 'accelerator_count': 1, + }, + }, + } + # key '0' assigns traffic for the newly deployed model + # Traffic percentage values must add up to 100 + # Leave dictionary empty if endpoint should not accept any traffic + traffic_split = {"0": 100} + # endpoint = client.endpoint_path( + # project=project, location=location, endpoint=endpoint_id + # ) + response = client.deploy_model( + endpoint=endpoint_path, deployed_model=deployed_model, traffic_split=traffic_split + ) + logging.info("Long running operation: %s", response.operation.name) + deploy_model_response = response.result(timeout=timeout) + logging.info("deploy_model_response: %s", deploy_model_response) + + import argparse + _parser = argparse.ArgumentParser(prog='Deploy model', description='') + _parser.add_argument("--project", dest="project", type=str, required=True, default=argparse.SUPPRESS) + _parser.add_argument("--endpoint-disp-name", dest="endpoint_disp_name", type=str, required=True, default=argparse.SUPPRESS) + _parser.add_argument("--model-name", dest="model_name", type=str, required=True, default=argparse.SUPPRESS) + _parser.add_argument("--deployed-model-display-name", dest="deployed_model_display_name", type=str, required=True, default=argparse.SUPPRESS) + _parser.add_argument("--location", dest="location", type=str, required=False, default=argparse.SUPPRESS) + _parser.add_argument("--api-endpoint", dest="api_endpoint", type=str, required=False, default=argparse.SUPPRESS) + _parser.add_argument("--timeout", dest="timeout", type=int, required=False, default=argparse.SUPPRESS) + _parsed_args = vars(_parser.parse_args()) + + _outputs = deploy_model(**_parsed_args) + args: + - --project + - {inputValue: project} + - --endpoint-disp-name + - {inputValue: endpoint_disp_name} + - --model-name + - {inputValue: model_name} + - --deployed-model-display-name + - {inputValue: deployed_model_display_name} + - if: + cond: {isPresent: location} + then: + - --location + - {inputValue: location} + - if: + cond: {isPresent: api_endpoint} + then: + - --api-endpoint + - {inputValue: api_endpoint} + - if: + cond: {isPresent: timeout} + then: + - --timeout + - {inputValue: timeout} diff --git a/ml/kubeflow-pipelines/keras_tuner/components/ucaip/model_train_component.yaml b/ml/kubeflow-pipelines/keras_tuner/components/ucaip/model_train_component.yaml new file mode 100644 index 0000000..eece276 --- /dev/null +++ b/ml/kubeflow-pipelines/keras_tuner/components/ucaip/model_train_component.yaml @@ -0,0 +1,221 @@ +name: Create training pipeline custom job +inputs: +- {name: project, type: String} +- {name: display_name, type: String} +- {name: model_display_name, type: String} +- {name: train_container_type, type: String} +- {name: executor_image_uri, type: String} +- {name: package_uri, type: String} +- {name: python_module, type: String} +- {name: container_image_uri, type: String} +- {name: base_output_directory_prefix, type: String} +- {name: prediction_image_uri, type: String} +- {name: location, type: String} +- {name: api_endpoint, type: String} +- {name: data_dir, type: String} +- {name: hptune_dict, type: String} +outputs: +- {name: model_id, type: String} +- {name: model_dispname, type: String} +implementation: + container: + image: gcr.io/google-samples/bw-aiplatform:v1 + command: + - sh + - -ec + - | + program_path=$(mktemp) + printf "%s" "$0" > "$program_path" + python3 -u "$program_path" "$@" + - | + def create_training_pipeline_custom_job( + project, + 
display_name, + model_display_name, + train_container_type, + executor_image_uri, + package_uri, + python_module, + container_image_uri, + base_output_directory_prefix, + prediction_image_uri, # 'us-docker.pkg.dev/cloud-aiplatform/prediction/tf2-cpu.2-3:latest' + location, # "us-central1" + api_endpoint, # "us-central1-aiplatform.googleapis.com", + data_dir, + hptune_dict, + ): + + import logging + import subprocess + import time + + from google.cloud import aiplatform + from google.protobuf import json_format + from google.protobuf.struct_pb2 import Value + from google.cloud.aiplatform_v1beta1.types import pipeline_state + + logging.getLogger().setLevel(logging.INFO) + + # The AI Platform services require regional API endpoints. + client_options = {"api_endpoint": api_endpoint} + # Initialize client that will be used to create and send requests. + # This client only needs to be created once, and can be reused for multiple requests. + client = aiplatform.gapic.PipelineServiceClient(client_options=client_options) + + if train_container_type == 'prebuilt': + python_package_spec = { + "executor_image_uri": executor_image_uri, + "package_uris": [package_uri], + "python_module": python_module, + "args": [f"--data-dir={data_dir}", + f"--hptune-dict={hptune_dict}"]} + worker_pool_spec = { + "machine_spec": { + "machine_type": "n1-standard-16", + "accelerator_type": aiplatform.gapic.AcceleratorType.NVIDIA_TESLA_K80, + "accelerator_count": 2, + }, + "replica_count": 1, + "python_package_spec": python_package_spec, + } + elif train_container_type == 'custom': + container_spec = { + # A working docker image can be found at gs://cloud-samples-data/ai-platform/mnist_tfrecord/custom_job + "imageUri": container_image_uri, + "args": [ + # AIP_MODEL_DIR is set by the service according to baseOutputDirectory. + "--model_dir=$(AIP_MODEL_DIR)", + ]} + worker_pool_spec = { + "machine_spec": { + "machine_type": "n1-standard-16", + "accelerator_type": aiplatform.gapic.AcceleratorType.NVIDIA_TESLA_K80, + "accelerator_count": 2, + }, + "replica_count": 1, + "container_spec": container_spec, + } + else: + logging.warning('unknown train_container_type; exiting') + exit(1) + + training_task_inputs_dict = { + "workerPoolSpecs": [ + worker_pool_spec + ], + "baseOutputDirectory": { + # The GCS location for outputs must be accessible by the project's AI Platform service account. 
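+            # e.g. 'gs://<your-bucket>/ucaip/training/' (hypothetical bucket); the
+            # service derives AIP_MODEL_DIR and related output paths from this prefix.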
+ "output_uri_prefix": base_output_directory_prefix + }, + } + training_task_inputs = json_format.ParseDict(training_task_inputs_dict, Value()) + + training_task_definition = "gs://google-cloud-aiplatform/schema/trainingjob/definition/custom_task_1.0.0.yaml" + + training_pipeline = { + "display_name": display_name, + "training_task_definition": training_task_definition, + "training_task_inputs": training_task_inputs, + "model_to_upload": { + "display_name": model_display_name, + "container_spec": {"image_uri": prediction_image_uri}, + }, + } + parent = f"projects/{project}/locations/{location}" + response = client.create_training_pipeline( + parent=parent, training_pipeline=training_pipeline + ) + logging.info("training pipeline request response: %s", response) + + SLEEP_INTERVAL = 100 + + training_pipeline_name = response.name + logging.info("training pipeline name: %s", training_pipeline_name) + # Poll periodically until training completes + while True: + mresponse = client.get_training_pipeline(name=training_pipeline_name) + logging.info('mresponse: %s', mresponse) + logging.info('job state: %s', mresponse.state) + if mresponse.state == pipeline_state.PipelineState.PIPELINE_STATE_FAILED: + logging.warning('training pipeline failed: %s', mresponse) + exit(1) + if mresponse.state == pipeline_state.PipelineState.PIPELINE_STATE_SUCCEEDED: + logging.info('training finished') + model_name = mresponse.model_to_upload.name + return (model_name, model_display_name) + else: + time.sleep(SLEEP_INTERVAL) + + def _serialize_str(str_value: str) -> str: + if not isinstance(str_value, str): + raise TypeError('Value "{}" has type "{}" instead of str.'.format(str(str_value), str(type(str_value)))) + return str_value + + import argparse + _parser = argparse.ArgumentParser(prog='Create training pipeline custom job', description='') + _parser.add_argument("--project", dest="project", type=str, required=True, default=argparse.SUPPRESS) + _parser.add_argument("--display-name", dest="display_name", type=str, required=True, default=argparse.SUPPRESS) + _parser.add_argument("--model-display-name", dest="model_display_name", type=str, required=True, default=argparse.SUPPRESS) + _parser.add_argument("--train-container-type", dest="train_container_type", type=str, required=True, default=argparse.SUPPRESS) + _parser.add_argument("--executor-image-uri", dest="executor_image_uri", type=str, required=True, default=argparse.SUPPRESS) + _parser.add_argument("--package-uri", dest="package_uri", type=str, required=True, default=argparse.SUPPRESS) + _parser.add_argument("--python-module", dest="python_module", type=str, required=True, default=argparse.SUPPRESS) + _parser.add_argument("--container-image-uri", dest="container_image_uri", type=str, required=True, default=argparse.SUPPRESS) + _parser.add_argument("--base-output-directory-prefix", dest="base_output_directory_prefix", type=str, required=True, default=argparse.SUPPRESS) + _parser.add_argument("--prediction-image-uri", dest="prediction_image_uri", type=str, required=True, default=argparse.SUPPRESS) + _parser.add_argument("--location", dest="location", type=str, required=True, default=argparse.SUPPRESS) + _parser.add_argument("--api-endpoint", dest="api_endpoint", type=str, required=True, default=argparse.SUPPRESS) + _parser.add_argument("--data-dir", dest="data_dir", type=str, required=True, default=argparse.SUPPRESS) + _parser.add_argument("--hptune-dict", dest="hptune_dict", type=str, required=True, default=argparse.SUPPRESS) + 
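+# KFP's generated wrapper passes one file path per declared component output
+# after the '----output-paths' flag below; the quadruple dash keeps it from
+# colliding with user-defined '--...' arguments.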
_parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=2) + _parsed_args = vars(_parser.parse_args()) + _output_files = _parsed_args.pop("_output_paths", []) + + _outputs = create_training_pipeline_custom_job(**_parsed_args) + + _output_serializers = [ + _serialize_str, + _serialize_str, + + ] + + import os + for idx, output_file in enumerate(_output_files): + try: + os.makedirs(os.path.dirname(output_file)) + except OSError: + pass + with open(output_file, 'w') as f: + f.write(_output_serializers[idx](_outputs[idx])) + args: + - --project + - {inputValue: project} + - --display-name + - {inputValue: display_name} + - --model-display-name + - {inputValue: model_display_name} + - --train-container-type + - {inputValue: train_container_type} + - --executor-image-uri + - {inputValue: executor_image_uri} + - --package-uri + - {inputValue: package_uri} + - --python-module + - {inputValue: python_module} + - --container-image-uri + - {inputValue: container_image_uri} + - --base-output-directory-prefix + - {inputValue: base_output_directory_prefix} + - --prediction-image-uri + - {inputValue: prediction_image_uri} + - --location + - {inputValue: location} + - --api-endpoint + - {inputValue: api_endpoint} + - --data-dir + - {inputValue: data_dir} + - --hptune-dict + - {inputValue: hptune_dict} + - '----output-paths' + - {outputPath: model_id} + - {outputPath: model_dispname} diff --git a/ml/kubeflow-pipelines/keras_tuner/components/ucaip/model_upload_component.yaml b/ml/kubeflow-pipelines/keras_tuner/components/ucaip/model_upload_component.yaml new file mode 100644 index 0000000..2a054f0 --- /dev/null +++ b/ml/kubeflow-pipelines/keras_tuner/components/ucaip/model_upload_component.yaml @@ -0,0 +1,120 @@ +name: Upload model +inputs: +- {name: project, type: String} +- {name: display_name, type: String} +- {name: image_uri, type: String} +- {name: artifact_uri, type: String} +- {name: location, type: String} +- {name: api_endpoint, type: String} +- {name: timeout, type: Integer} +outputs: +- {name: model_id, type: String} +implementation: + container: + image: gcr.io/google-samples/bw-aiplatform:v1 + command: + - sh + - -ec + - | + program_path=$(mktemp) + printf "%s" "$0" > "$program_path" + python3 -u "$program_path" "$@" + - | + def upload_model( + project, + display_name, + image_uri, + artifact_uri, + location, # "us-central1", + api_endpoint, #"us-central1-aiplatform.googleapis.com", + timeout, # 1800, + ): + + import logging + import subprocess + from google.cloud import aiplatform + + logging.getLogger().setLevel(logging.INFO) + metadata_schema_uri = "" + + # The AI Platform services require regional API endpoints. + client_options = {"api_endpoint": api_endpoint} + # Initialize client that will be used to create and send requests. + # This client only needs to be created once, and can be reused for multiple requests. + client = aiplatform.gapic.ModelServiceClient(client_options=client_options) + model = { + "display_name": display_name, + "metadata_schema_uri": metadata_schema_uri, + # The artifact_uri should be the path to a GCS directory containing + # saved model artifacts. The bucket must be accessible for the + # project's AI Platform service account and in the same region as + # the api endpoint. 
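+        # e.g. 'gs://<bucket>/<job-dir>/export/bikesw/<timestamp>' (hypothetical
+        # path; compare the gcloud example at the bottom of serving/model_upload.py).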
+ "artifact_uri": artifact_uri, + "container_spec": { + "image_uri": image_uri, + "command": [], + "args": [], + "env": [], + "ports": [], + "predict_route": "", + "health_route": "", + }, + } + parent = f"projects/{project}/locations/{location}" + response = client.upload_model(parent=parent, model=model) + logging.info("Long running operation: %s", response.operation.name) + upload_model_response = response.result(timeout=timeout) + logging.info("upload_model_response: %s", upload_model_response) + model_path = upload_model_response.model + return (model_path, ) + + def _serialize_str(str_value: str) -> str: + if not isinstance(str_value, str): + raise TypeError('Value "{}" has type "{}" instead of str.'.format(str(str_value), str(type(str_value)))) + return str_value + + import argparse + _parser = argparse.ArgumentParser(prog='Upload model', description='') + _parser.add_argument("--project", dest="project", type=str, required=True, default=argparse.SUPPRESS) + _parser.add_argument("--display-name", dest="display_name", type=str, required=True, default=argparse.SUPPRESS) + _parser.add_argument("--image-uri", dest="image_uri", type=str, required=True, default=argparse.SUPPRESS) + _parser.add_argument("--artifact-uri", dest="artifact_uri", type=str, required=True, default=argparse.SUPPRESS) + _parser.add_argument("--location", dest="location", type=str, required=True, default=argparse.SUPPRESS) + _parser.add_argument("--api-endpoint", dest="api_endpoint", type=str, required=True, default=argparse.SUPPRESS) + _parser.add_argument("--timeout", dest="timeout", type=int, required=True, default=argparse.SUPPRESS) + _parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=1) + _parsed_args = vars(_parser.parse_args()) + _output_files = _parsed_args.pop("_output_paths", []) + + _outputs = upload_model(**_parsed_args) + + _output_serializers = [ + _serialize_str, + + ] + + import os + for idx, output_file in enumerate(_output_files): + try: + os.makedirs(os.path.dirname(output_file)) + except OSError: + pass + with open(output_file, 'w') as f: + f.write(_output_serializers[idx](_outputs[idx])) + args: + - --project + - {inputValue: project} + - --display-name + - {inputValue: display_name} + - --image-uri + - {inputValue: image_uri} + - --artifact-uri + - {inputValue: artifact_uri} + - --location + - {inputValue: location} + - --api-endpoint + - {inputValue: api_endpoint} + - --timeout + - {inputValue: timeout} + - '----output-paths' + - {outputPath: model_id} diff --git a/ml/kubeflow-pipelines/keras_tuner/components/ucaip/serving/deploy_model.py b/ml/kubeflow-pipelines/keras_tuner/components/ucaip/serving/deploy_model.py new file mode 100644 index 0000000..27f5fea --- /dev/null +++ b/ml/kubeflow-pipelines/keras_tuner/components/ucaip/serving/deploy_model.py @@ -0,0 +1,95 @@ +# Copyright 2021 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
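+
+# This module is the source for ../model_deploy_component.yaml: running it as a
+# script (see the __main__ block at the bottom) regenerates that component spec
+# via kfp.components.func_to_container_op.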
+ +def deploy_model( + project: str, + endpoint_disp_name: str, + model_name: str, + deployed_model_display_name: str, + location: str = "us-central1", + api_endpoint: str = "us-central1-aiplatform.googleapis.com", + timeout: int = 7200, + ): + + import logging + from google.cloud import aiplatform + + logging.getLogger().setLevel(logging.INFO) + + def create_endpoint( + project: str, + display_name: str, + client, + location: str = "us-central1", + api_endpoint: str = "us-central1-aiplatform.googleapis.com", + timeout: int = 300, + ): + + endpoint = {"display_name": display_name} + parent = f"projects/{project}/locations/{location}" + response = client.create_endpoint(parent=parent, endpoint=endpoint) + print("Long running operation:", response.operation.name) + create_endpoint_response = response.result(timeout=timeout) + print("create_endpoint_response:", create_endpoint_response) + endpoint_name = create_endpoint_response.name + logging.info('endpoint name: %s', endpoint_name) + return endpoint_name + + # The AI Platform services require regional API endpoints. + client_options = {"api_endpoint": api_endpoint} + # Initialize client that will be used to create and send requests. + # This client only needs to be created once, and can be reused for multiple requests. + client = aiplatform.gapic.EndpointServiceClient(client_options=client_options) + + # create endpoint + logging.info('creating endpoint %s', endpoint_disp_name) + endpoint_path = create_endpoint(project, endpoint_disp_name, client) + logging.info("using endpoint path ID %s", endpoint_path) + + deployed_model = { + # format: 'projects/{project}/locations/{location}/models/{model}' + "model": model_name, + "display_name": deployed_model_display_name, + # `dedicated_resources` must be used for non-AutoML models + "dedicated_resources": { + "min_replica_count": 1, + "machine_spec": { + "machine_type": "n1-standard-2", + # Accelerators can be used only if the model specifies a GPU image. + # 'accelerator_type': aiplatform.gapic.AcceleratorType.NVIDIA_TESLA_K80, + # 'accelerator_count': 1, + }, + }, + } + # key '0' assigns traffic for the newly deployed model + # Traffic percentage values must add up to 100 + # Leave dictionary empty if endpoint should not accept any traffic + traffic_split = {"0": 100} +# endpoint = client.endpoint_path( +# project=project, location=location, endpoint=endpoint_id +# ) + response = client.deploy_model( + endpoint=endpoint_path, deployed_model=deployed_model, traffic_split=traffic_split + ) + logging.info("Long running operation: %s", response.operation.name) + deploy_model_response = response.result(timeout=timeout) + logging.info("deploy_model_response: %s", deploy_model_response) + # TODO: output status info in some form + + +if __name__ == '__main__': + import kfp + kfp.components.func_to_container_op(deploy_model, + output_component_file='../model_deploy_component.yaml', + base_image='gcr.io/google-samples/bw-aiplatform:v1') diff --git a/ml/kubeflow-pipelines/keras_tuner/components/ucaip/serving/model_upload.py b/ml/kubeflow-pipelines/keras_tuner/components/ucaip/serving/model_upload.py new file mode 100644 index 0000000..c90586e --- /dev/null +++ b/ml/kubeflow-pipelines/keras_tuner/components/ucaip/serving/model_upload.py @@ -0,0 +1,76 @@ +# Copyright 2021 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# from kfp.components import OutputPath +from typing import NamedTuple + + +def upload_model( + project: str, + display_name: str, + image_uri: str, + artifact_uri: str, + location: str, # "us-central1", + api_endpoint: str, #"us-central1-aiplatform.googleapis.com", + timeout: int, # 1800, + ) -> NamedTuple('Outputs', [('model_id', str)]): + + import logging + import subprocess + from google.cloud import aiplatform + + logging.getLogger().setLevel(logging.INFO) + metadata_schema_uri = "" + + # The AI Platform services require regional API endpoints. + client_options = {"api_endpoint": api_endpoint} + # Initialize client that will be used to create and send requests. + # This client only needs to be created once, and can be reused for multiple requests. + client = aiplatform.gapic.ModelServiceClient(client_options=client_options) + model = { + "display_name": display_name, + "metadata_schema_uri": metadata_schema_uri, + # The artifact_uri should be the path to a GCS directory containing + # saved model artifacts. The bucket must be accessible for the + # project's AI Platform service account and in the same region as + # the api endpoint. + "artifact_uri": artifact_uri, + "container_spec": { + "image_uri": image_uri, + "command": [], + "args": [], + "env": [], + "ports": [], + "predict_route": "", + "health_route": "", + }, + } + parent = f"projects/{project}/locations/{location}" + response = client.upload_model(parent=parent, model=model) + logging.info("Long running operation: %s", response.operation.name) + upload_model_response = response.result(timeout=timeout) + logging.info("upload_model_response: %s", upload_model_response) + model_path = upload_model_response.model + return (model_path, ) + + + +if __name__ == '__main__': + import kfp + kfp.components.func_to_container_op(upload_model, + output_component_file='../model_upload_component.yaml', + base_image='gcr.io/google-samples/bw-aiplatform:v1') + + +# gcloud beta ai models upload --region=us-central1 --display-name=bw2 --container-image-uri=us-docker.pkg.dev/cloud-aiplatform/prediction/tf2-cpu.2-3:latest --artifact-uri=gs://aju-pipelines/ktune13/f8515c75-32b7-47a4-af70-5ff24362eccc/0/bwmodel/trained_model/export/bikesw/1603733739 \ No newline at end of file diff --git a/ml/kubeflow-pipelines/keras_tuner/components/ucaip/training/bwmodel/__init__.py b/ml/kubeflow-pipelines/keras_tuner/components/ucaip/training/bwmodel/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/ml/kubeflow-pipelines/keras_tuner/components/ucaip/training/bwmodel/model.py b/ml/kubeflow-pipelines/keras_tuner/components/ucaip/training/bwmodel/model.py new file mode 100644 index 0000000..1f4e022 --- /dev/null +++ b/ml/kubeflow-pipelines/keras_tuner/components/ucaip/training/bwmodel/model.py @@ -0,0 +1,111 @@ +# Copyright 2020 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Adapted in part from: +# https://github.com/GoogleCloudPlatform/data-science-on-gcp/blob/master/09_cloudml/flights_model_tf2.ipynb +# by Valliappa Lakshmanan. (See that repo for more info about the accompanying book, +# "Data Science on the Google Cloud Platform", from O'Reilly.) + +import tensorflow as tf + + +CSV_COLUMNS = ('duration,end_station_id,bike_id,ts,day_of_week,start_station_id' + + ',start_latitude,start_longitude,end_latitude,end_longitude' + + ',euclidean,loc_cross,prcp,max,min,temp,dewp').split(',') +LABEL_COLUMN = 'duration' +DEFAULTS = [[0.0], ['na'], ['na'], [0.0], ['na'], ['na'], + [0.0], [0.0], [0.0], [0.0], + [0.0], ['na'], [0.0], [0.0], [0.0], [0.0], [0.0]] + +def load_dataset(pattern, batch_size=1): + return tf.data.experimental.make_csv_dataset(pattern, batch_size, CSV_COLUMNS, DEFAULTS) + +def features_and_labels(features): + label = features.pop('duration') # this is what we will train for + features.pop('bike_id') + return features, label + +def read_dataset(pattern, batch_size, mode=tf.estimator.ModeKeys.TRAIN, truncate=None): + dataset = load_dataset(pattern, batch_size) + dataset = dataset.map(features_and_labels, num_parallel_calls=tf.data.experimental.AUTOTUNE) + if mode == tf.estimator.ModeKeys.TRAIN: + dataset = dataset.repeat().shuffle(batch_size*10) + # dataset = dataset.repeat() + dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE) + # dataset = dataset.prefetch(1) + if truncate is not None: + dataset = dataset.take(truncate) + return dataset + +def get_layers(): + + # duration,end_station_id,bike_id,ts,day_of_week,start_station_id,start_latitude,start_longitude,end_latitude,end_longitude, + # euclidean,loc_cross,prcp,max,min,temp,dewp + real = { + colname : tf.feature_column.numeric_column(colname) + for colname in + # ('ts,start_latitude,start_longitude,end_latitude,end_longitude,euclidean,prcp,max,min,temp,dewp').split(',') + # ('ts,euclidean,prcp,max,min,temp,dewp').split(',') + ('euclidean,prcp,max,min,temp,dewp').split(',') + } + sparse = { + 'day_of_week': tf.feature_column.categorical_column_with_vocabulary_list('day_of_week', + vocabulary_list='1,2,3,4,5,6,7'.split(',')), + 'end_station_id' : tf.feature_column.categorical_column_with_hash_bucket( + 'end_station_id', hash_bucket_size=800), + 'start_station_id' : tf.feature_column.categorical_column_with_hash_bucket( + 'start_station_id', hash_bucket_size=800), + 'loc_cross' : tf.feature_column.categorical_column_with_hash_bucket( + 'loc_cross', hash_bucket_size=21000), + # 'bike_id' : tf.feature_column.categorical_column_with_hash_bucket('bike_id', hash_bucket_size=14000) + } + inputs = { + colname : tf.keras.layers.Input(name=colname, shape=(), dtype='float32') + for colname in real.keys() + } + inputs.update({'ts': tf.keras.layers.Input(name='ts', shape=(), dtype='float64')}) + inputs.update({ + colname : tf.keras.layers.Input(name=colname, shape=(), dtype='string') + for colname in sparse.keys() + }) + # embed all the sparse columns + embed = { + 'embed_{}'.format(colname) : tf.feature_column.embedding_column(col, 10) + for colname, col in sparse.items() + } + 
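+  # The 10-dim embeddings join the numeric columns on the "deep" side of the
+  # model; the indicator (one-hot) columns built next feed the "wide" side
+  # (see wide_and_deep_classifier below).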
real.update(embed) + # one-hot encode the sparse columns + sparse = { + colname : tf.feature_column.indicator_column(col) + for colname, col in sparse.items() + } + return inputs, sparse, real + +# Build a wide-and-deep model. +def wide_and_deep_classifier(inputs, linear_feature_columns, dnn_feature_columns, + num_hidden_layers, dnn_hidden_units1, learning_rate): + deep = tf.keras.layers.DenseFeatures(dnn_feature_columns, name='deep_inputs')(inputs) + layers = [dnn_hidden_units1] + if num_hidden_layers > 1: + layers += [int(dnn_hidden_units1/(x*2)) for x in range(1, num_hidden_layers)] + for layerno, numnodes in enumerate(layers): + deep = tf.keras.layers.Dense(numnodes, activation='relu', name='dnn_{}'.format(layerno+1))(deep) + wide = tf.keras.layers.DenseFeatures(linear_feature_columns, name='wide_inputs')(inputs) + both = tf.keras.layers.concatenate([deep, wide], name='both') + output = tf.keras.layers.Dense(1, name='dur')(both) + model = tf.keras.Model(inputs, output) + optimizer = tf.keras.optimizers.RMSprop(learning_rate) + model.compile(loss='mse', optimizer=optimizer, + metrics=['mse', 'mae', tf.keras.metrics.RootMeanSquaredError()]) + return model diff --git a/ml/kubeflow-pipelines/keras_tuner/components/ucaip/training/create_training_job.py b/ml/kubeflow-pipelines/keras_tuner/components/ucaip/training/create_training_job.py new file mode 100644 index 0000000..2c004a2 --- /dev/null +++ b/ml/kubeflow-pipelines/keras_tuner/components/ucaip/training/create_training_job.py @@ -0,0 +1,147 @@ +# Copyright 2021 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import NamedTuple + + +def create_training_pipeline_custom_job( + project: str, + display_name: str, + model_display_name: str, + train_container_type: str, + executor_image_uri: str, + package_uri: str, + python_module: str, + container_image_uri: str, + base_output_directory_prefix: str, + prediction_image_uri: str, # 'us-docker.pkg.dev/cloud-aiplatform/prediction/tf2-cpu.2-3:latest' + location: str, # "us-central1" + api_endpoint: str, # "us-central1-aiplatform.googleapis.com", + data_dir: str, + hptune_dict: str, +) -> NamedTuple('Outputs', [('model_id', str), ('model_dispname', str)]): + + import logging + import subprocess + import time + + from google.cloud import aiplatform + from google.protobuf import json_format + from google.protobuf.struct_pb2 import Value + from google.cloud.aiplatform_v1beta1.types import pipeline_state + + logging.getLogger().setLevel(logging.INFO) + + # The AI Platform services require regional API endpoints. + client_options = {"api_endpoint": api_endpoint} + # Initialize client that will be used to create and send requests. + # This client only needs to be created once, and can be reused for multiple requests. 
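+  # PipelineServiceClient manages TrainingPipeline resources; the Model- and
+  # Endpoint-service clients used by the other components in this sample are
+  # configured the same way via client_options.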
+ client = aiplatform.gapic.PipelineServiceClient(client_options=client_options) + + # TODO: more error checking before kicking off the job + if train_container_type == 'prebuilt': + if package_uri == 'none' or executor_image_uri == 'none': + logging.warning('unspecified URI; exiting') + exit(1) + python_package_spec = { + "executor_image_uri": executor_image_uri, + "package_uris": [package_uri], + "python_module": python_module, + "args": [f"--data-dir={data_dir}", + f"--hptune-dict={hptune_dict}"]} + worker_pool_spec = { + "machine_spec": { + "machine_type": "n1-standard-16", + "accelerator_type": aiplatform.gapic.AcceleratorType.NVIDIA_TESLA_K80, + "accelerator_count": 2, + }, + "replica_count": 1, + "python_package_spec": python_package_spec, + } + elif train_container_type == 'custom': + if container_image_uri == 'none': + logging.warning('unspecified container_image_uri; exiting') + exit(1) + container_spec = { + # A working docker image can be found at gs://cloud-samples-data/ai-platform/mnist_tfrecord/custom_job + "imageUri": container_image_uri, + "args": [ + # AIP_MODEL_DIR is set by the service according to baseOutputDirectory. + "--model_dir=$(AIP_MODEL_DIR)", + ]} + worker_pool_spec = { + "machine_spec": { + "machine_type": "n1-standard-16", + "accelerator_type": aiplatform.gapic.AcceleratorType.NVIDIA_TESLA_K80, + "accelerator_count": 2, + }, + "replica_count": 1, + "container_spec": container_spec, + } + else: + logging.warning('unknown train_container_type; exiting') + exit(1) + + training_task_inputs_dict = { + "workerPoolSpecs": [ + worker_pool_spec + ], + "baseOutputDirectory": { + # The GCS location for outputs must be accessible by the project's AI Platform service account. + "output_uri_prefix": base_output_directory_prefix + }, + } + training_task_inputs = json_format.ParseDict(training_task_inputs_dict, Value()) + + training_task_definition = "gs://google-cloud-aiplatform/schema/trainingjob/definition/custom_task_1.0.0.yaml" + + training_pipeline = { + "display_name": display_name, + "training_task_definition": training_task_definition, + "training_task_inputs": training_task_inputs, + "model_to_upload": { + "display_name": model_display_name, + "container_spec": {"image_uri": prediction_image_uri}, + }, + } + parent = f"projects/{project}/locations/{location}" + response = client.create_training_pipeline( + parent=parent, training_pipeline=training_pipeline + ) + logging.info("training pipeline request response: %s", response) + + SLEEP_INTERVAL = 100 + + training_pipeline_name = response.name + logging.info("training pipeline name: %s", training_pipeline_name) + # Poll periodically until training completes + while True: + mresponse = client.get_training_pipeline(name=training_pipeline_name) + logging.info('mresponse: %s', mresponse) + logging.info('job state: %s', mresponse.state) + if mresponse.state == pipeline_state.PipelineState.PIPELINE_STATE_FAILED: + logging.warning('training pipeline failed: %s', mresponse) + exit(1) + if mresponse.state == pipeline_state.PipelineState.PIPELINE_STATE_SUCCEEDED: + logging.info('training finished') + model_name = mresponse.model_to_upload.name + return (model_name, model_display_name) + else: + time.sleep(SLEEP_INTERVAL) + +if __name__ == '__main__': + import kfp + kfp.components.func_to_container_op(create_training_pipeline_custom_job, + output_component_file='../model_train_component.yaml', + base_image='gcr.io/google-samples/bw-aiplatform:v1') diff --git 
a/ml/kubeflow-pipelines/keras_tuner/components/ucaip/training/dist/bw-trainer-0.1.tar.gz b/ml/kubeflow-pipelines/keras_tuner/components/ucaip/training/dist/bw-trainer-0.1.tar.gz new file mode 100644 index 0000000..c21c6dd Binary files /dev/null and b/ml/kubeflow-pipelines/keras_tuner/components/ucaip/training/dist/bw-trainer-0.1.tar.gz differ diff --git a/ml/kubeflow-pipelines/keras_tuner/components/ucaip/training/setup.py b/ml/kubeflow-pipelines/keras_tuner/components/ucaip/training/setup.py new file mode 100644 index 0000000..494c34b --- /dev/null +++ b/ml/kubeflow-pipelines/keras_tuner/components/ucaip/training/setup.py @@ -0,0 +1,27 @@ +# Copyright 2021 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from setuptools import find_packages +from setuptools import setup + +REQUIRED_PACKAGES = [] + +setup( + name='bw-trainer', + version='0.1', + install_requires=REQUIRED_PACKAGES, + packages=find_packages(), + include_package_data=True, + description='bikes & weather training application.' +) diff --git a/ml/kubeflow-pipelines/keras_tuner/components/ucaip/training/trainer/__init__.py b/ml/kubeflow-pipelines/keras_tuner/components/ucaip/training/trainer/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/ml/kubeflow-pipelines/keras_tuner/components/ucaip/training/trainer/task.py b/ml/kubeflow-pipelines/keras_tuner/components/ucaip/training/trainer/task.py new file mode 100644 index 0000000..85e4218 --- /dev/null +++ b/ml/kubeflow-pipelines/keras_tuner/components/ucaip/training/trainer/task.py @@ -0,0 +1,147 @@ +# Copyright 2021 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
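+
+# Trainer entry point: parses a JSON dict of hyperparameters from
+# --hptune-dict, builds and trains the 'bikes & weather' Keras model defined
+# in bwmodel.model, and exports the SavedModel to $AIP_MODEL_DIR (set by the
+# training service from the job's baseOutputDirectory).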
+
+import argparse
+import logging
+import json
+import os
+
+import tensorflow as tf
+
+import bwmodel.model as bwmodel
+
+DEVELOP_MODE = False
+NBUCKETS = 5  # for embeddings
+NUM_EXAMPLES = 1000 * 1000 * 20  # assume 20 million examples
+
+STRATEGY = tf.distribute.MirroredStrategy()
+TRAIN_BATCH_SIZE = 64 * STRATEGY.num_replicas_in_sync
+
+
+def create_model(learning_rate, hidden_size, num_hidden_layers):
+
+  inputs, sparse, real = bwmodel.get_layers()
+
+  logging.info('sparse keys: %s', sparse.keys())
+  logging.info('real keys: %s', real.keys())
+  logging.info('num replicas in sync: %s', STRATEGY.num_replicas_in_sync)
+
+  # Build and compile the model under the distribution strategy scope so that
+  # its variables are mirrored across the available devices.
+  with STRATEGY.scope():
+    model = bwmodel.wide_and_deep_classifier(
+        inputs,
+        linear_feature_columns=sparse.values(),
+        dnn_feature_columns=real.values(),
+        num_hidden_layers=num_hidden_layers,
+        dnn_hidden_units1=hidden_size,
+        learning_rate=learning_rate)
+
+  model.summary()
+  return model
+
+
+def run_training(
+    # epochs: int,
+    data_dir: str,
+    # steps_per_epoch: int,
+    hptune_dict: str
+    ):
+
+  if 'AIP_MODEL_DIR' not in os.environ:
+    raise KeyError(
+        'The `AIP_MODEL_DIR` environment variable has not been set. See '
+        'https://cloud.google.com/ai-platform-unified/docs/tutorials/image-recognition-custom/training'
+    )
+
+  logging.getLogger().setLevel(logging.INFO)
+
+  # data_dir = 'gs://aju-dev-demos-codelabs/bikes_weather/'
+
+  hptune_info = json.loads(str(hptune_dict))
+  logging.info('hptune_info: %s', hptune_info)
+  learning_rate = hptune_info['learning_rate']
+  hidden_size = hptune_info['hidden_size']
+  num_hidden_layers = hptune_info['num_hidden_layers']
+  epochs = hptune_info['epochs']
+  steps_per_epoch = hptune_info['steps_per_epoch']
+  logging.info('using: learning rate %s, hidden size %s, num hidden layers %s',
+      learning_rate, hidden_size, num_hidden_layers)
+  logging.info('epochs: %s', epochs)
+  logging.info('Tensorflow version %s', tf.__version__)
+
+  TRAIN_DATA_PATTERN = data_dir + "train*"
+  EVAL_DATA_PATTERN = data_dir + "test*"
+
+  # OUTPUT_DIR = '{}/bwmodel/trained_model'.format(args.workdir)
+  OUTPUT_DIR = os.environ['AIP_MODEL_DIR']
+  logging.info('Writing trained model to %s', OUTPUT_DIR)
+
+  train_batch_size = TRAIN_BATCH_SIZE
+  eval_batch_size = 1000
+  if steps_per_epoch == -1:  # calc based on dataset size
+    steps_per_epoch = NUM_EXAMPLES // train_batch_size
+  logging.info('using %s steps per epoch', steps_per_epoch)
+
+  train_dataset = bwmodel.read_dataset(TRAIN_DATA_PATTERN, train_batch_size)
+  eval_dataset = bwmodel.read_dataset(EVAL_DATA_PATTERN, eval_batch_size,
+      tf.estimator.ModeKeys.EVAL, eval_batch_size * 100 * STRATEGY.num_replicas_in_sync
+  )
+
+  model = create_model(learning_rate, hidden_size, num_hidden_layers)
+
+  checkpoint_path = '{}checkpoints/bikes_weather.cpt'.format(OUTPUT_DIR)
+  logging.info("checkpoint path: %s", checkpoint_path)
+  cp_callback = tf.keras.callbacks.ModelCheckpoint(checkpoint_path,
+                                                   save_weights_only=True,
+                                                   verbose=1)
+  tb_callback = tf.keras.callbacks.TensorBoard(log_dir='{}/logs'.format(OUTPUT_DIR),
+                                               update_freq=20000)
+
+  logging.info("training model....")
+  history = model.fit(train_dataset,
+                      validation_data=eval_dataset,
+                      validation_steps=eval_batch_size,
+                      epochs=epochs,
+                      steps_per_epoch=steps_per_epoch,
+                      callbacks=[cp_callback, tb_callback]
+                      )
+
+  tf.saved_model.save(model, OUTPUT_DIR)
+
+
+if __name__ == '__main__':
+  parser = argparse.ArgumentParser()
+  # Input Arguments
+  # parser.add_argument(
+  #     '--epochs', 
type=int, default=5) + parser.add_argument( + # e.g. {"num_hidden_layers": 3, "hidden_size": 96, "learning_rate": 0.01} + '--hptune-dict', required=True) + # parser.add_argument( + # '--steps-per-epoch', type=int, + # default=-1) # if set to -1, don't override the normal calcs for this + parser.add_argument( + '--data-dir', default='gs://aju-dev-demos-codelabs/bikes_weather/') + args = parser.parse_args() + + run_training( + # args.epochs, + args.data_dir, + # args.steps_per_epoch, + args.hptune_dict) diff --git a/ml/kubeflow-pipelines/keras_tuner/components/ucaip/ucaip_train_pipeline_spec.json b/ml/kubeflow-pipelines/keras_tuner/components/ucaip/ucaip_train_pipeline_spec.json new file mode 100644 index 0000000..b067f8f --- /dev/null +++ b/ml/kubeflow-pipelines/keras_tuner/components/ucaip/ucaip_train_pipeline_spec.json @@ -0,0 +1,321 @@ +{ + "pipelineSpec": { + "sdkVersion": "kfp-1.4.0", + "deploymentConfig": { + "@type": "type.googleapis.com/ml_pipelines.PipelineDeploymentConfig", + "executors": { + "Deploy model": { + "container": { + "image": "gcr.io/google-samples/bw-aiplatform:v1", + "args": [ + "--project", + "{{$.inputs.parameters['project']}}", + "--endpoint-disp-name", + "{{$.inputs.parameters['endpoint_disp_name']}}", + "--model-name", + "{{$.inputs.parameters['model_name']}}", + "--deployed-model-display-name", + "{{$.inputs.parameters['deployed_model_display_name']}}", + "--location", + "{{$.inputs.parameters['location']}}", + "--api-endpoint", + "{{$.inputs.parameters['api_endpoint']}}", + "--timeout", + "{{$.inputs.parameters['timeout']}}" + ], + "command": [ + "sh", + "-ec", + "program_path=$(mktemp)\nprintf \"%s\" \"$0\" > \"$program_path\"\npython3 -u \"$program_path\" \"$@\"\n", + "def deploy_model(\n project,\n endpoint_disp_name,\n model_name,\n deployed_model_display_name,\n location = \"us-central1\",\n api_endpoint = \"us-central1-aiplatform.googleapis.com\",\n timeout = 7200,\n ):\n\n import logging\n from google.cloud import aiplatform\n\n logging.getLogger().setLevel(logging.INFO)\n\n def create_endpoint(\n project,\n display_name,\n client,\n location = \"us-central1\",\n api_endpoint = \"us-central1-aiplatform.googleapis.com\",\n timeout = 300,\n ):\n\n endpoint = {\"display_name\": display_name}\n parent = f\"projects/{project}/locations/{location}\"\n response = client.create_endpoint(parent=parent, endpoint=endpoint)\n print(\"Long running operation:\", response.operation.name)\n create_endpoint_response = response.result(timeout=timeout)\n print(\"create_endpoint_response:\", create_endpoint_response)\n endpoint_name = create_endpoint_response.name\n logging.info('endpoint name: %s', endpoint_name)\n return endpoint_name\n\n # The AI Platform services require regional API endpoints.\n client_options = {\"api_endpoint\": api_endpoint}\n # Initialize client that will be used to create and send requests.\n # This client only needs to be created once, and can be reused for multiple requests.\n client = aiplatform.gapic.EndpointServiceClient(client_options=client_options)\n\n # create endpoint\n logging.info('creating endpoint %s', endpoint_disp_name)\n endpoint_path = create_endpoint(project, endpoint_disp_name, client)\n logging.info(\"using endpoint path ID %s\", endpoint_path)\n\n deployed_model = {\n # format: 'projects/{project}/locations/{location}/models/{model}'\n \"model\": model_name,\n \"display_name\": deployed_model_display_name,\n # `dedicated_resources` must be used for non-AutoML models\n \"dedicated_resources\": {\n \"min_replica_count\": 1,\n 
\"machine_spec\": {\n \"machine_type\": \"n1-standard-2\",\n # Accelerators can be used only if the model specifies a GPU image.\n # 'accelerator_type': aiplatform.gapic.AcceleratorType.NVIDIA_TESLA_K80,\n # 'accelerator_count': 1,\n },\n },\n }\n # key '0' assigns traffic for the newly deployed model\n # Traffic percentage values must add up to 100\n # Leave dictionary empty if endpoint should not accept any traffic\n traffic_split = {\"0\": 100}\n# endpoint = client.endpoint_path(\n# project=project, location=location, endpoint=endpoint_id\n# )\n response = client.deploy_model(\n endpoint=endpoint_path, deployed_model=deployed_model, traffic_split=traffic_split\n )\n logging.info(\"Long running operation: %s\", response.operation.name)\n deploy_model_response = response.result(timeout=timeout)\n logging.info(\"deploy_model_response: %s\", deploy_model_response)\n\nimport argparse\n_parser = argparse.ArgumentParser(prog='Deploy model', description='')\n_parser.add_argument(\"--project\", dest=\"project\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--endpoint-disp-name\", dest=\"endpoint_disp_name\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--model-name\", dest=\"model_name\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--deployed-model-display-name\", dest=\"deployed_model_display_name\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--location\", dest=\"location\", type=str, required=False, default=argparse.SUPPRESS)\n_parser.add_argument(\"--api-endpoint\", dest=\"api_endpoint\", type=str, required=False, default=argparse.SUPPRESS)\n_parser.add_argument(\"--timeout\", dest=\"timeout\", type=int, required=False, default=argparse.SUPPRESS)\n_parsed_args = vars(_parser.parse_args())\n\n_outputs = deploy_model(**_parsed_args)\n" + ] + } + }, + "Create training pipeline custom job": { + "container": { + "command": [ + "sh", + "-ec", + "program_path=$(mktemp)\nprintf \"%s\" \"$0\" > \"$program_path\"\npython3 -u \"$program_path\" \"$@\"\n", + "def create_training_pipeline_custom_job(\n project,\n display_name,\n model_display_name,\n train_container_type,\n executor_image_uri,\n package_uri,\n python_module,\n container_image_uri,\n base_output_directory_prefix,\n prediction_image_uri, # 'us-docker.pkg.dev/cloud-aiplatform/prediction/tf2-cpu.2-3:latest'\n location, # \"us-central1\"\n api_endpoint, # \"us-central1-aiplatform.googleapis.com\",\n data_dir,\n hptune_dict,\n):\n\n import logging\n import subprocess\n import time\n\n from google.cloud import aiplatform\n from google.protobuf import json_format\n from google.protobuf.struct_pb2 import Value\n from google.cloud.aiplatform_v1beta1.types import pipeline_state\n\n logging.getLogger().setLevel(logging.INFO)\n\n # The AI Platform services require regional API endpoints.\n client_options = {\"api_endpoint\": api_endpoint}\n # Initialize client that will be used to create and send requests.\n # This client only needs to be created once, and can be reused for multiple requests.\n client = aiplatform.gapic.PipelineServiceClient(client_options=client_options)\n\n if train_container_type == 'prebuilt':\n python_package_spec = {\n \"executor_image_uri\": executor_image_uri,\n \"package_uris\": [package_uri],\n \"python_module\": python_module,\n \"args\": [f\"--data-dir={data_dir}\",\n f\"--hptune-dict={hptune_dict}\"]}\n worker_pool_spec = {\n \"machine_spec\": {\n \"machine_type\": \"n1-standard-16\",\n 
\"accelerator_type\": aiplatform.gapic.AcceleratorType.NVIDIA_TESLA_K80,\n \"accelerator_count\": 2,\n },\n \"replica_count\": 1,\n \"python_package_spec\": python_package_spec,\n }\n elif train_container_type == 'custom':\n container_spec = {\n # A working docker image can be found at gs://cloud-samples-data/ai-platform/mnist_tfrecord/custom_job\n \"imageUri\": container_image_uri,\n \"args\": [\n # AIP_MODEL_DIR is set by the service according to baseOutputDirectory.\n \"--model_dir=$(AIP_MODEL_DIR)\",\n ]}\n worker_pool_spec = {\n \"machine_spec\": {\n \"machine_type\": \"n1-standard-16\",\n \"accelerator_type\": aiplatform.gapic.AcceleratorType.NVIDIA_TESLA_K80,\n \"accelerator_count\": 2,\n },\n \"replica_count\": 1,\n \"container_spec\": container_spec,\n }\n else:\n logging.warning('unknown train_container_type; exiting')\n exit(1)\n\n training_task_inputs_dict = {\n \"workerPoolSpecs\": [\n worker_pool_spec\n ],\n \"baseOutputDirectory\": {\n # The GCS location for outputs must be accessible by the project's AI Platform service account.\n \"output_uri_prefix\": base_output_directory_prefix\n },\n }\n training_task_inputs = json_format.ParseDict(training_task_inputs_dict, Value())\n\n training_task_definition = \"gs://google-cloud-aiplatform/schema/trainingjob/definition/custom_task_1.0.0.yaml\"\n\n training_pipeline = {\n \"display_name\": display_name,\n \"training_task_definition\": training_task_definition,\n \"training_task_inputs\": training_task_inputs,\n \"model_to_upload\": {\n \"display_name\": model_display_name,\n \"container_spec\": {\"image_uri\": prediction_image_uri},\n },\n }\n parent = f\"projects/{project}/locations/{location}\"\n response = client.create_training_pipeline(\n parent=parent, training_pipeline=training_pipeline\n )\n logging.info(\"training pipeline request response: %s\", response)\n\n SLEEP_INTERVAL = 100\n\n training_pipeline_name = response.name\n logging.info(\"training pipeline name: %s\", training_pipeline_name)\n # Poll periodically until training completes\n while True:\n mresponse = client.get_training_pipeline(name=training_pipeline_name)\n logging.info('mresponse: %s', mresponse)\n logging.info('job state: %s', mresponse.state)\n if mresponse.state == pipeline_state.PipelineState.PIPELINE_STATE_FAILED:\n logging.warning('training pipeline failed: %s', mresponse)\n exit(1)\n if mresponse.state == pipeline_state.PipelineState.PIPELINE_STATE_SUCCEEDED:\n logging.info('training finished')\n model_name = mresponse.model_to_upload.name\n return (model_name, model_display_name)\n else:\n time.sleep(SLEEP_INTERVAL)\n\ndef _serialize_str(str_value: str) -> str:\n if not isinstance(str_value, str):\n raise TypeError('Value \"{}\" has type \"{}\" instead of str.'.format(str(str_value), str(type(str_value))))\n return str_value\n\nimport argparse\n_parser = argparse.ArgumentParser(prog='Create training pipeline custom job', description='')\n_parser.add_argument(\"--project\", dest=\"project\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--display-name\", dest=\"display_name\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--model-display-name\", dest=\"model_display_name\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--train-container-type\", dest=\"train_container_type\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--executor-image-uri\", dest=\"executor_image_uri\", type=str, required=True, 
default=argparse.SUPPRESS)\n_parser.add_argument(\"--package-uri\", dest=\"package_uri\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--python-module\", dest=\"python_module\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--container-image-uri\", dest=\"container_image_uri\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--base-output-directory-prefix\", dest=\"base_output_directory_prefix\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--prediction-image-uri\", dest=\"prediction_image_uri\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--location\", dest=\"location\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--api-endpoint\", dest=\"api_endpoint\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--data-dir\", dest=\"data_dir\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--hptune-dict\", dest=\"hptune_dict\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"----output-paths\", dest=\"_output_paths\", type=str, nargs=2)\n_parsed_args = vars(_parser.parse_args())\n_output_files = _parsed_args.pop(\"_output_paths\", [])\n\n_outputs = create_training_pipeline_custom_job(**_parsed_args)\n\n_output_serializers = [\n _serialize_str,\n _serialize_str,\n\n]\n\nimport os\nfor idx, output_file in enumerate(_output_files):\n try:\n os.makedirs(os.path.dirname(output_file))\n except OSError:\n pass\n with open(output_file, 'w') as f:\n f.write(_output_serializers[idx](_outputs[idx]))\n" + ], + "args": [ + "--project", + "{{$.inputs.parameters['project']}}", + "--display-name", + "{{$.inputs.parameters['display_name']}}", + "--model-display-name", + "{{$.inputs.parameters['model_display_name']}}", + "--train-container-type", + "{{$.inputs.parameters['train_container_type']}}", + "--executor-image-uri", + "{{$.inputs.parameters['executor_image_uri']}}", + "--package-uri", + "{{$.inputs.parameters['package_uri']}}", + "--python-module", + "{{$.inputs.parameters['python_module']}}", + "--container-image-uri", + "{{$.inputs.parameters['container_image_uri']}}", + "--base-output-directory-prefix", + "{{$.inputs.parameters['base_output_directory_prefix']}}", + "--prediction-image-uri", + "{{$.inputs.parameters['prediction_image_uri']}}", + "--location", + "{{$.inputs.parameters['location']}}", + "--api-endpoint", + "{{$.inputs.parameters['api_endpoint']}}", + "--data-dir", + "{{$.inputs.parameters['data_dir']}}", + "--hptune-dict", + "{{$.inputs.parameters['hptune_dict']}}", + "----output-paths", + "{{$.outputs.parameters['model_id'].output_file}}", + "{{$.outputs.parameters['model_dispname'].output_file}}" + ], + "image": "gcr.io/google-samples/bw-aiplatform:v1" + } + } + } + }, + "runtimeParameters": { + "executor_image_uri": { + "defaultValue": { + "stringValue": "us-docker.pkg.dev/cloud-aiplatform/training/tf-gpu.2-3:latest" + }, + "type": "STRING" + }, + "model_display_name": { + "defaultValue": { + "stringValue": "CHANGE THIS" + }, + "type": "STRING" + }, + "timeout": { + "type": "INT", + "defaultValue": { + "intValue": "7200" + } + }, + "training_display_name": { + "type": "STRING", + "defaultValue": { + "stringValue": "CHANGE THIS" + } + }, + "project": { + "defaultValue": { + "stringValue": "aju-vtests2" + }, + "type": "STRING" + }, + "endpoint_disp_name": { + "defaultValue": { + "stringValue": "CHANGE THIS" + }, + "type": 
"STRING" + }, + "python_module": { + "defaultValue": { + "stringValue": "trainer.task" + }, + "type": "STRING" + }, + "api_endpoint": { + "type": "STRING", + "defaultValue": { + "stringValue": "us-central1-aiplatform.googleapis.com" + } + }, + "package_uri": { + "defaultValue": { + "stringValue": "gs://aju-pipelines/ucaip/training1/bw-trainer-0.1.tar.gz" + }, + "type": "STRING" + }, + "hptune_dict": { + "type": "STRING", + "defaultValue": { + "stringValue": "{\"num_hidden_layers\": 3, \"hidden_size\": 32, \"learning_rate\": 0.01, \"epochs\": 3, \"steps_per_epoch\": -1}" + } + }, + "location": { + "defaultValue": { + "stringValue": "us-central1" + }, + "type": "STRING" + }, + "base_output_directory_prefix": { + "type": "STRING", + "defaultValue": { + "stringValue": "gs://aju-pipelines/ucaip/training2/" + } + }, + "container_image_uri": { + "type": "STRING" + }, + "data_dir": { + "defaultValue": { + "stringValue": "gs://aju-dev-demos-codelabs/bikes_weather/" + }, + "type": "STRING" + }, + "prediction_image_uri": { + "type": "STRING", + "defaultValue": { + "stringValue": "us-docker.pkg.dev/cloud-aiplatform/prediction/tf2-cpu.2-3:latest" + } + }, + "train_container_type": { + "type": "STRING", + "defaultValue": { + "stringValue": "prebuilt" + } + } + }, + "schemaVersion": "v2alpha1", + "tasks": [ + { + "taskInfo": { + "name": "Create training pipeline custom job" + }, + "outputs": { + "parameters": { + "model_id": { + "type": "STRING" + }, + "model_dispname": { + "type": "STRING" + } + } + }, + "inputs": { + "parameters": { + "base_output_directory_prefix": { + "runtimeValue": { + "runtimeParameter": "base_output_directory_prefix" + } + }, + "hptune_dict": { + "runtimeValue": { + "runtimeParameter": "hptune_dict" + } + }, + "prediction_image_uri": { + "runtimeValue": { + "runtimeParameter": "prediction_image_uri" + } + }, + "executor_image_uri": { + "runtimeValue": { + "runtimeParameter": "executor_image_uri" + } + }, + "train_container_type": { + "runtimeValue": { + "runtimeParameter": "train_container_type" + } + }, + "container_image_uri": { + "runtimeValue": { + "runtimeParameter": "container_image_uri" + } + }, + "package_uri": { + "runtimeValue": { + "runtimeParameter": "package_uri" + } + }, + "location": { + "runtimeValue": { + "runtimeParameter": "location" + } + }, + "python_module": { + "runtimeValue": { + "runtimeParameter": "python_module" + } + }, + "api_endpoint": { + "runtimeValue": { + "runtimeParameter": "api_endpoint" + } + }, + "project": { + "runtimeValue": { + "runtimeParameter": "project" + } + }, + "model_display_name": { + "runtimeValue": { + "runtimeParameter": "model_display_name" + } + }, + "display_name": { + "runtimeValue": { + "runtimeParameter": "training_display_name" + } + }, + "data_dir": { + "runtimeValue": { + "runtimeParameter": "data_dir" + } + } + } + }, + "executorLabel": "Create training pipeline custom job" + }, + { + "taskInfo": { + "name": "Deploy model" + }, + "executorLabel": "Deploy model", + "inputs": { + "parameters": { + "model_name": { + "taskOutputParameter": { + "producerTask": "Create training pipeline custom job", + "outputParameterKey": "model_id" + } + }, + "endpoint_disp_name": { + "runtimeValue": { + "runtimeParameter": "endpoint_disp_name" + } + }, + "api_endpoint": { + "runtimeValue": { + "runtimeParameter": "api_endpoint" + } + }, + "timeout": { + "runtimeValue": { + "runtimeParameter": "timeout" + } + }, + "location": { + "runtimeValue": { + "runtimeParameter": "location" + } + }, + "deployed_model_display_name": { + 
"runtimeValue": { + "runtimeParameter": "model_display_name" + } + }, + "project": { + "runtimeValue": { + "runtimeParameter": "project" + } + } + } + } + } + ], + "pipelineInfo": { + "name": "ucaip-model-train" + } + }, + "runtimeConfig": { + "gcsOutputDirectory": "gs://aju-pipelines/pipeline_root/ucaiptests" + } +} \ No newline at end of file