
Commit d186a6f

Tables e2e (#60)

kfp bug?

1 parent 7b326d6 commit d186a6f

File tree

7 files changed: +50 -111 lines changed


ml/automl/tables/kfp_e2e/README.md

Lines changed: 3 additions & 3 deletions
@@ -76,9 +76,9 @@ Once a Pipelines installation is running, we can upload the example AutoML Table
 Click on **Pipelines** in the left nav bar of the Pipelines Dashboard. Click on **Upload Pipeline**.
 
 - For Cloud AI Platform Pipelines, upload [`tables_pipeline_caip.py.tar.gz`][36], from this directory. This archive points to the compiled version of [this pipeline][37], specified and compiled using the [Kubeflow Pipelines SDK][38].
-- For Kubeflow Pipelines on a Kubeflow installation, upload [`tables_pipeline_kf.py.tar.gz`][39]. This archive points to the compiled version of [this pipeline][40].
+- For Kubeflow Pipelines on a Kubeflow installation, upload [`tables_pipeline_kf.py.tar.gz`][39]. This archive points to the compiled version of [this pipeline][40]. **To run this example on a KF installation, you will need to give the `<deployment-name>-user@<project-id>.iam.gserviceaccount.com` service account `AutoML Admin` privileges**.
 
-> Note: The difference between the two pipelines relates to how GCP authentication is handled. For the Kubeflow pipeline, we’ve added `.apply(gcp.use_gcp_secret('user-gcp-sa'))` annotations to the pipeline steps. This tells the pipeline to use the mounted _secret_—set up during the installation process— that provides GCP account credentials. With the Cloud AI Platform Pipelines installation, the GKE cluster nodes have been set up to use the `cloud-platform` scope. With an upcoming Kubeflow release, specification of the mounted secret will no longer be necessary.
+> Note: The difference between the two pipelines relates to how GCP authentication is handled. For the Kubeflow pipeline, we’ve added `.apply(gcp.use_gcp_secret('user-gcp-sa'))` annotations to the pipeline steps. This tells the pipeline to use the mounted _secret_—set up during the installation process— that provides GCP account credentials. With the Cloud AI Platform Pipelines installation, the GKE cluster nodes have been set up to use the `cloud-platform` scope. With recent Kubeflow releases, specification of the mounted secret is no longer necessary, but we include both versions for compatibility.
 
 The uploaded pipeline graph will look similar to this:
 
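As background for the note above, here is a minimal sketch of how that annotation is attached to a step, assuming the KFP v1 SDK; the `ContainerOp` shown is a placeholder for illustration, not one of this pipeline's actual steps:

```python
import kfp.dsl as dsl
import kfp.gcp as gcp

@dsl.pipeline(name='auth-sketch', description='Illustrates mounting the user-gcp-sa secret')
def auth_sketch():
  # A placeholder step; any ContainerOp in the pipeline is annotated the same way.
  op = dsl.ContainerOp(
      name='check-auth',
      image='google/cloud-sdk:slim',
      command=['sh', '-c', 'gcloud auth list'],
  )
  # On a Kubeflow installation, mount the 'user-gcp-sa' secret so the step can
  # reach GCP APIs; on Cloud AI Platform Pipelines the cluster nodes' own
  # cloud-platform scope makes this unnecessary.
  op.apply(gcp.use_gcp_secret('user-gcp-sa'))
```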
@@ -88,7 +88,7 @@ The uploaded pipeline graph will look similar to this:
 </figure>
 
 Click the **+Create Run** button to run the pipeline. You will need to fill in some pipeline parameters.
-Specifically, replace `YOUR_PROJECT_HERE` with the name of your project; replace `YOUR_DATASET_NAME` with the name you want to give your new dataset (make it unique, and use letters, numbers and underscores up to 32 characters); and replace `YOUR_BUCKET_NAME` with the name of a [GCS bucket][41]. This bucket should be in the [same _region_][42] as that specified by the `gcp_region` parameter. E.g., if you keep the default `us-central1` region, your bucket should also be a _regional_ (not multi-regional) bucket in the `us-central1` region. ++double check that this is necessary.++
+Specifically, replace `YOUR_PROJECT_HERE` with the name of your project; replace `YOUR_DATASET_NAME` with the name you want to give your new dataset (make it unique, and use letters, numbers and underscores up to 32 characters); and replace `YOUR_BUCKET_NAME` with the name of a [GCS bucket][41]. Do not include the `gs://` prefix— just enter the name. This bucket should be in the [same _region_][42] as that specified by the `gcp_region` parameter. E.g., if you keep the default `us-central1` region, your bucket should also be a _regional_ (not multi-regional) bucket in the `us-central1` region. ++double check that this is necessary.++
 
 If you want to schedule a recurrent set of runs, you can do that instead. If your data is in [BigQuery][43]— as is the case for this example pipeline— and has a temporal aspect, you could define a _view_ to reflect that, e.g. to return data from a window over the last `N` days or hours. Then, the AutoML pipeline could specify ingestion of data from that view, grabbing an updated data window each time the pipeline is run, and building a new model based on that updated window.
 
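To make the windowed-view idea above concrete, here is a hedged sketch using the `google-cloud-bigquery` client; the dataset, table, and timestamp-column names are hypothetical:

```python
from google.cloud import bigquery

client = bigquery.Client(project='YOUR_PROJECT_HERE')

# Define a view over the last 7 days of a (hypothetical) source table with a
# TIMESTAMP column `ts`. A pipeline that ingests from this view picks up a
# freshly updated data window on every run.
view = bigquery.Table('YOUR_PROJECT_HERE.your_dataset.recent_window')
view.view_query = """
    SELECT *
    FROM `YOUR_PROJECT_HERE.your_dataset.source_table`
    WHERE ts >= TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 7 DAY)
"""
client.create_table(view)
```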

ml/automl/tables/kfp_e2e/create_model_for_tables/tables_eval_metrics_component.py

Lines changed: 17 additions & 34 deletions
@@ -19,46 +19,29 @@
 # An example of how the model eval info could be used to make decisions aboiut whether or not
 # to deploy the model.
 def automl_eval_metrics(
-    # gcp_project_id: str,
-    # gcp_region: str,
-    # model_display_name: str,
     eval_data_path: InputPath('evals'),
     mlpipeline_ui_metadata_path: OutputPath('UI_metadata'),
     mlpipeline_metrics_path: OutputPath('UI_metrics'),
-    # api_endpoint: str = None,
     # thresholds: str = '{"au_prc": 0.9}',
-    thresholds: str = '{"mean_absolute_error": 450}',
+    thresholds: str = '{"mean_absolute_error": 460}',
     confidence_threshold: float = 0.5 # for classification
 
-) -> NamedTuple('Outputs', [('deploy', bool)]):
+# ) -> NamedTuple('Outputs', [('deploy', str)]): # this gives the same result
+) -> NamedTuple('Outputs', [('deploy', 'String')]):
   import subprocess
   import sys
-  # we could build a base image that includes these libraries if we don't want to do
-  # the dynamic installation when the step runs.
-  # subprocess.run([sys.executable, '-m', 'pip', 'install', 'googleapis-common-protos==1.6.0',
-  #     '--no-warn-script-location'], env={'PIP_DISABLE_PIP_VERSION_CHECK': '1'}, check=True)
-  # subprocess.run([sys.executable, '-m', 'pip', 'install', 'google-cloud-automl==0.9.0',
-  #     'google-cloud-storage',
-  #     '--no-warn-script-location'], env={'PIP_DISABLE_PIP_VERSION_CHECK': '1'}, check=True)
+  subprocess.run([sys.executable, '-m', 'pip', 'install', 'googleapis-common-protos==1.6.0',
+      '--no-warn-script-location'], env={'PIP_DISABLE_PIP_VERSION_CHECK': '1'}, check=True)
+  subprocess.run([sys.executable, '-m', 'pip', 'install', 'google-cloud-automl==0.9.0',
+      'google-cloud-storage',
+      '--no-warn-script-location'], env={'PIP_DISABLE_PIP_VERSION_CHECK': '1'}, check=True)
 
-  # import google
+  import google
   import json
   import logging
   import pickle
-  # from google.api_core.client_options import ClientOptions
-  # from google.api_core import exceptions
-  # from google.cloud import automl_v1beta1 as automl
-  # from google.cloud import storage
 
   logging.getLogger().setLevel(logging.INFO)  # TODO: make level configurable
-  # TODO: we could instead check for region 'eu' and use 'eu-automl.googleapis.com:443'endpoint
-  # in that case, instead of requiring endpoint to be specified.
-  # if api_endpoint:
-  #   client_options = ClientOptions(api_endpoint=api_endpoint)
-  #   client = automl.TablesClient(project=gcp_project_id, region=gcp_region,
-  #       client_options=client_options)
-  # else:
-  #   client = automl.TablesClient(project=gcp_project_id, region=gcp_region)
 
   thresholds_dict = json.loads(thresholds)
   logging.info('thresholds dict: {}'.format(thresholds_dict))
@@ -78,12 +61,12 @@ def regression_threshold_check(eval_info):
       if eresults[k] > v:
         logging.info('{} > {}; returning False'.format(
            eresults[k], v))
-        return (False, eresults)
+        return ('False', eresults)
       elif eresults[k] < v:
         logging.info('{} < {}; returning False'.format(
            eresults[k], v))
-        return (False, eresults)
-    return (True, eresults)
+        return ('False', eresults)
+    return ('deploy', eresults)
 
   def classif_threshold_check(eval_info):
     eresults = {}
@@ -108,13 +91,13 @@ def classif_threshold_check(eval_info):
       if eresults[k] > v:
         logging.info('{} > {}; returning False'.format(
            eresults[k], v))
-        return (False, eresults)
+        return ('False', eresults)
       else:
         if eresults[k] < v:
           logging.info('{} < {}; returning False'.format(
              eresults[k], v))
-          return (False, eresults)
-    return (True, eresults)
+          return ('False', eresults)
+    return ('deploy', eresults)
 
   with open(eval_data_path, 'rb') as f:
     logging.info('successfully opened eval_data_path {}'.format(eval_data_path))
@@ -177,13 +160,13 @@ def classif_threshold_check(eval_info):
        mlpipeline_ui_metadata_file.write(json.dumps(metadata))
      logging.info('deploy flag: {}'.format(res))
      return res
-    return True
+    return 'deploy'
   except Exception as e:
     logging.warning(e)
     # If can't reconstruct the eval, or don't have thresholds defined,
     # return True as a signal to deploy.
     # TODO: is this the right default?
-    return True
+    return 'deploy'
 
 
 if __name__ == '__main__':
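The bool-to-string switch in this component reflects how KFP v1 serializes component outputs to strings before any downstream comparison, so the deploy flag is emitted as the string 'deploy' or 'False' rather than a Python bool. A simplified sketch of the pattern (not the component's exact logic; for `mean_absolute_error`, lower is better):

```python
import json

def regression_threshold_check(eresults, thresholds='{"mean_absolute_error": 460}'):
  # Return the string 'deploy' (rather than the bool True) when every metric
  # clears its threshold, and 'False' otherwise; the pipeline's dsl.Condition
  # then compares against a string value.
  thresholds_dict = json.loads(thresholds)
  for k, v in thresholds_dict.items():
    if k in eresults and eresults[k] > v:
      return ('False', eresults)
  return ('deploy', eresults)

print(regression_threshold_check({'mean_absolute_error': 440}))
# -> ('deploy', {'mean_absolute_error': 440})
```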

ml/automl/tables/kfp_e2e/create_model_for_tables/tables_eval_metrics_component.yaml

Lines changed: 28 additions & 62 deletions
@@ -4,7 +4,7 @@ inputs:
   type: evals
 - name: thresholds
   type: String
-  default: '{"mean_absolute_error": 450}'
+  default: '{"mean_absolute_error": 460}'
   optional: true
 - name: confidence_threshold
   type: Float
@@ -16,7 +16,7 @@ outputs:
 - name: mlpipeline_metrics
   type: UI_metrics
 - name: deploy
-  type: Boolean
+  type: String
 implementation:
   container:
     image: python:3.7
@@ -25,64 +25,35 @@ implementation:
     - -u
     - -c
     - |
-      class OutputPath:
-          '''When creating component from function, OutputPath should be used as function parameter annotation to tell the system that the function wants to output data by writing it into a file with the given path instead of returning the data from the function.'''
-          def __init__(self, type=None):
-              self.type = type
-
       def _make_parent_dirs_and_return_path(file_path: str):
           import os
          os.makedirs(os.path.dirname(file_path), exist_ok=True)
          return file_path
 
-      class InputPath:
-          '''When creating component from function, InputPath should be used as function parameter annotation to tell the system to pass the *data file path* to the function instead of passing the actual data.'''
-          def __init__(self, type=None):
-              self.type = type
-
-      from typing import NamedTuple
-
       def automl_eval_metrics(
-          # gcp_project_id: str,
-          # gcp_region: str,
-          # model_display_name: str,
-          eval_data_path: InputPath('evals'),
-          mlpipeline_ui_metadata_path: OutputPath('UI_metadata'),
-          mlpipeline_metrics_path: OutputPath('UI_metrics'),
-          # api_endpoint: str = None,
+          eval_data_path ,
+          mlpipeline_ui_metadata_path ,
+          mlpipeline_metrics_path ,
           # thresholds: str = '{"au_prc": 0.9}',
-          thresholds: str = '{"mean_absolute_error": 450}',
-          confidence_threshold: float = 0.5 # for classification
+          thresholds = '{"mean_absolute_error": 460}',
+          confidence_threshold = 0.5 # for classification
 
-      ) -> NamedTuple('Outputs', [('deploy', bool)]):
+      # ) -> NamedTuple('Outputs', [('deploy', str)]):
+      ) :
         import subprocess
         import sys
-        # we could build a base image that includes these libraries if we don't want to do
-        # the dynamic installation when the step runs.
-        # subprocess.run([sys.executable, '-m', 'pip', 'install', 'googleapis-common-protos==1.6.0',
-        #     '--no-warn-script-location'], env={'PIP_DISABLE_PIP_VERSION_CHECK': '1'}, check=True)
-        # subprocess.run([sys.executable, '-m', 'pip', 'install', 'google-cloud-automl==0.9.0',
-        #     'google-cloud-storage',
-        #     '--no-warn-script-location'], env={'PIP_DISABLE_PIP_VERSION_CHECK': '1'}, check=True)
+        subprocess.run([sys.executable, '-m', 'pip', 'install', 'googleapis-common-protos==1.6.0',
+            '--no-warn-script-location'], env={'PIP_DISABLE_PIP_VERSION_CHECK': '1'}, check=True)
+        subprocess.run([sys.executable, '-m', 'pip', 'install', 'google-cloud-automl==0.9.0',
+            'google-cloud-storage',
+            '--no-warn-script-location'], env={'PIP_DISABLE_PIP_VERSION_CHECK': '1'}, check=True)
 
-        # import google
+        import google
         import json
         import logging
         import pickle
-        # from google.api_core.client_options import ClientOptions
-        # from google.api_core import exceptions
-        # from google.cloud import automl_v1beta1 as automl
-        # from google.cloud import storage
 
         logging.getLogger().setLevel(logging.INFO)  # TODO: make level configurable
-        # TODO: we could instead check for region 'eu' and use 'eu-automl.googleapis.com:443'endpoint
-        # in that case, instead of requiring endpoint to be specified.
-        # if api_endpoint:
-        #   client_options = ClientOptions(api_endpoint=api_endpoint)
-        #   client = automl.TablesClient(project=gcp_project_id, region=gcp_region,
-        #       client_options=client_options)
-        # else:
-        #   client = automl.TablesClient(project=gcp_project_id, region=gcp_region)
 
         thresholds_dict = json.loads(thresholds)
         logging.info('thresholds dict: {}'.format(thresholds_dict))
@@ -102,12 +73,12 @@ implementation:
             if eresults[k] > v:
               logging.info('{} > {}; returning False'.format(
                  eresults[k], v))
-              return (False, eresults)
+              return ('False', eresults)
             elif eresults[k] < v:
               logging.info('{} < {}; returning False'.format(
                  eresults[k], v))
-              return (False, eresults)
-          return (True, eresults)
+              return ('False', eresults)
+          return ('deploy', eresults)
 
         def classif_threshold_check(eval_info):
           eresults = {}
@@ -132,13 +103,13 @@ implementation:
             if eresults[k] > v:
               logging.info('{} > {}; returning False'.format(
                  eresults[k], v))
-              return (False, eresults)
+              return ('False', eresults)
             else:
               if eresults[k] < v:
                 logging.info('{} < {}; returning False'.format(
                    eresults[k], v))
-                return (False, eresults)
-          return (True, eresults)
+                return ('False', eresults)
+          return ('deploy', eresults)
 
         with open(eval_data_path, 'rb') as f:
           logging.info('successfully opened eval_data_path {}'.format(eval_data_path))
@@ -201,20 +172,18 @@ implementation:
              mlpipeline_ui_metadata_file.write(json.dumps(metadata))
            logging.info('deploy flag: {}'.format(res))
            return res
-          return True
+          return 'deploy'
         except Exception as e:
           logging.warning(e)
           # If can't reconstruct the eval, or don't have thresholds defined,
           # return True as a signal to deploy.
           # TODO: is this the right default?
-          return True
+          return 'deploy'
 
-      def _serialize_bool(bool_value: bool) -> str:
-          if isinstance(bool_value, str):
-              return bool_value
-          if not isinstance(bool_value, bool):
-              raise TypeError('Value "{}" has type "{}" instead of bool.'.format(str(bool_value), str(type(bool_value))))
-          return str(bool_value)
+      def _serialize_str(str_value: str) -> str:
+          if not isinstance(str_value, str):
+              raise TypeError('Value "{}" has type "{}" instead of str.'.format(str(str_value), str(type(str_value))))
+          return str_value
 
       import argparse
       _parser = argparse.ArgumentParser(prog='Automl eval metrics', description='')
@@ -229,11 +198,8 @@ implementation:
 
       _outputs = automl_eval_metrics(**_parsed_args)
 
-      if not hasattr(_outputs, '__getitem__') or isinstance(_outputs, str):
-          _outputs = [_outputs]
-
       _output_serializers = [
-          _serialize_bool,
+          _serialize_str,
 
       ]
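Component YAML files like this one are generated from the Python function rather than edited by hand. A sketch of how such a spec is regenerated, assuming a recent KFP v1 SDK; `flag_op` is a toy stand-in for `automl_eval_metrics`, and the output file name is illustrative:

```python
from typing import NamedTuple
from kfp.components import create_component_from_func

def flag_op(value: float) -> NamedTuple('Outputs', [('deploy', str)]):
  # Mirror the component above: emit a string flag, which the generated YAML
  # declares as `type: String` and serializes via a helper like _serialize_str.
  return ('deploy',) if value < 460 else ('False',)

# Writing the spec produces a YAML with the same generated scaffolding
# (helper functions, argparse wrapper, output serializers) seen in this diff.
create_component_from_func(
    flag_op,
    base_image='python:3.7',
    output_component_file='flag_op_component.yaml')
```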

ml/automl/tables/kfp_e2e/tables_pipeline_caip.py

Lines changed: 1 addition & 6 deletions
@@ -126,16 +126,11 @@ def automl_tables( #pylint: disable=unused-argument
   )
 
   eval_metrics = eval_metrics_op(
-      # gcp_project_id=gcp_project_id,
-      # gcp_region=gcp_region,
-      # bucket_name=bucket_name,
-      # api_endpoint=api_endpoint,
-      # model_display_name=train_model.outputs['model_display_name'],
       thresholds=thresholds,
       eval_data=eval_model.outputs['eval_data'],
   )
 
-  with dsl.Condition(eval_metrics.outputs['deploy'] == True):
+  with dsl.Condition(eval_metrics.outputs['deploy'] == 'True'):
     deploy_model = deploy_model_op(
       gcp_project_id=gcp_project_id,
       gcp_region=gcp_region,
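Since the `deploy` output now arrives as a serialized string, the condition compares it to a string literal. A self-contained sketch of the pattern, assuming the KFP v1 SDK; both `ContainerOp`s are toy stand-ins for the real eval-metrics and deploy components:

```python
import kfp.dsl as dsl

@dsl.pipeline(name='conditional-deploy-sketch')
def conditional_deploy():
  # Toy eval step that writes its 'deploy' output as a plain string.
  check = dsl.ContainerOp(
      name='eval-metrics',
      image='python:3.7',
      command=['sh', '-c', 'echo -n True > /tmp/deploy'],
      file_outputs={'deploy': '/tmp/deploy'},
  )
  # Compare against a string literal: output parameter values are strings by
  # the time the condition is evaluated.
  with dsl.Condition(check.outputs['deploy'] == 'True'):
    dsl.ContainerOp(
        name='deploy-model',
        image='python:3.7',
        command=['echo', 'deploying model'],
    )
```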
ml/automl/tables/kfp_e2e/tables_pipeline_caip.py.tar.gz

-188 Bytes
Binary file not shown.

ml/automl/tables/kfp_e2e/tables_pipeline_kf.py

Lines changed: 1 addition & 6 deletions
@@ -126,16 +126,11 @@ def automl_tables( #pylint: disable=unused-argument
   ).apply(gcp.use_gcp_secret('user-gcp-sa'))
 
   eval_metrics = eval_metrics_op(
-      # gcp_project_id=gcp_project_id,
-      # gcp_region=gcp_region,
-      # bucket_name=bucket_name,
-      # api_endpoint=api_endpoint,
-      # model_display_name=train_model.outputs['model_display_name'],
       thresholds=thresholds,
       eval_data=eval_model.outputs['eval_data'],
   ).apply(gcp.use_gcp_secret('user-gcp-sa'))
 
-  with dsl.Condition(eval_metrics.outputs['deploy'] == True):
+  with dsl.Condition(eval_metrics.outputs['deploy'] == 'd'):
     deploy_model = deploy_model_op(
       gcp_project_id=gcp_project_id,
       gcp_region=gcp_region,
ml/automl/tables/kfp_e2e/tables_pipeline_kf.py.tar.gz

-215 Bytes
Binary file not shown.
