Commit 739114a

begin addressing comments
1 parent 92cf763 commit 739114a

3 files changed: +90, -115 lines

data-science-onramp/data-ingestion/setup.py

Lines changed: 29 additions & 28 deletions
@@ -1,4 +1,4 @@
-from random import choice, choices, randint, seed
+import random
 import sys
 
 from time import time_ns
@@ -19,10 +19,10 @@
 upload = True  # Whether to upload data to BigQuery
 
 # Check whether or not results should be uploaded
-try:
-    sys.argv[2]
+if len(sys.arv) > 1:
     upload = False
-except IndexError:
+    print("Not uploading results to BigQuery")
+else:
     print("Results will be uploaded to BigQuery")
 
 table = "bigquery-public-data.new_york_citibike.citibike_trips"
@@ -37,59 +37,60 @@
 # START MAKING DATA DIRTY
 
 
-def random_select(items, cum_weights):
+def random_select(items, weights):
     '''Picks an item according to the cumulative weights'''
-    return choices(items, cum_weights=cum_weights, k=1)[0]
+    return random.choices(items, weights=weights, k=1)[0]
 
 
-def tripduration(duration):
+def trip_duration(duration):
     '''Converts trip duration to other units'''
     seconds = str(duration) + " s"
     minutes = str(float(duration) / 60) + " min"
     hours = str(float(duration) / 3600) + " h"
-    return random_select([seconds, minutes, hours, str(randint(-1000, -1))],
-                         [0.3, 0.6, 0.9, 1])
+    return random_select([seconds, minutes, hours,
+                          str(random.randint(-1000, -1))],
+                         [0.3, 0.3, 0.3, 0.1])
 
 
 def station_name(name):
     '''Replaces '&' with '/' with a 50% chance'''
-    return choice([name, name.replace("&", "/")])
+    return random.choice([name, name.replace("&", "/")])
 
 
-def usertype(user):
+def user_type(user):
     '''Manipulates the user type string'''
-    return choice([user, user.upper(), user.lower(),
-                   "sub" if user == "Subscriber" else user,
-                   "cust" if user == "Customer" else user])
+    return random.choice([user, user.upper(), user.lower(),
+                          "sub" if user == "Subscriber" else user,
+                          "cust" if user == "Customer" else user])
 
 
 def gender(s):
     '''Manipulates the gender string'''
-    return choice([s, s.upper(), s.lower(),
-                   s[0] if len(s) > 0 else "",
-                   s[0].lower() if len(s) > 0 else ""])
+    return random.choice([s, s.upper(), s.lower(),
+                          s[0] if len(s) > 0 else "",
+                          s[0].lower() if len(s) > 0 else ""])
 
 
-def convertAngle(angle):
+def convert_angle(angle):
     '''Converts long and lat to DMS notation'''
     degrees = int(angle)
     minutes = int((angle - degrees) * 60)
     seconds = int((angle - degrees - minutes/60) * 3600)
     new_angle = str(degrees) + u"\u00B0" + \
         str(minutes) + "'" + str(seconds) + '"'
-    return random_select([str(angle), new_angle], cum_weights=[0.55, 1])
+    return random_select([str(angle), new_angle], [0.55, 0.45])
 
 
 def dirty_data(proc_func, allow_none):
     '''Master function returns a user defined function
     that transforms the column data'''
     def udf(col_value):
-        seed(hash(col_value) + time_ns())
+        random.seed(hash(col_value) + time_ns())
         if col_value is None:
             return col_value
         elif allow_none:
             return random_select([None, proc_func(col_value)],
-                                 cum_weights=[0.05, 1])
+                                 [0.05, 0.95])
         else:
            return proc_func(col_value)
    return udf
@@ -101,19 +102,19 @@ def id(x):
 
 # Declare data transformations for each column in dataframe
 udfs = [
-    (dirty_data(tripduration, True), StringType()),  # tripduration
+    (dirty_data(trip_duration, True), StringType()),  # tripduration
     (dirty_data(id, True), StringType()),  # starttime
     (dirty_data(id, True), StringType()),  # stoptime
     (id, IntegerType()),  # start_station_id
     (dirty_data(station_name, False), StringType()),  # start_station_name
-    (dirty_data(convertAngle, True), StringType()),  # start_station_latitude
-    (dirty_data(convertAngle, True), StringType()),  # start_station_longitude
+    (dirty_data(convert_angle, True), StringType()),  # start_station_latitude
+    (dirty_data(convert_angle, True), StringType()),  # start_station_longitude
     (id, IntegerType()),  # end_station_id
     (dirty_data(station_name, False), StringType()),  # end_station_name
-    (dirty_data(convertAngle, True), StringType()),  # end_station_latitude
-    (dirty_data(convertAngle, True), StringType()),  # end_station_longitude
+    (dirty_data(convert_angle, True), StringType()),  # end_station_latitude
+    (dirty_data(convert_angle, True), StringType()),  # end_station_longitude
     (id, IntegerType()),  # bikeid
-    (dirty_data(usertype, False), StringType()),  # usertype
+    (dirty_data(user_type, False), StringType()),  # usertype
     (id, IntegerType()),  # birth_year
     (dirty_data(gender, False), StringType()),  # gender
     (id, StringType()),  # customer_plan
@@ -136,7 +137,7 @@ def id(x):
 if upload:
     # Create BigQuery Dataset
     client = bigquery.Client()
-    dataset_id = '{}.new_york_citibike_trips'.format(client.project)
+    dataset_id = f'{client.project}.new_york_citibike_trips'
     dataset = bigquery.Dataset(dataset_id)
     dataset.location = "US"
     dataset = client.create_dataset(dataset)
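
The setup.py change above swaps cumulative weights for per-item weights in random.choices. The two forms describe the same distribution only when the weights list is the element-wise difference of the cumulative list, which is why [0.3, 0.6, 0.9, 1] becomes [0.3, 0.3, 0.3, 0.1]. A minimal standalone sketch of the two equivalent calls (the item names here are illustrative, not taken from setup.py):

import random

items = ["seconds", "minutes", "hours", "negative"]

# Old style: cum_weights are running totals of the probabilities.
old_pick = random.choices(items, cum_weights=[0.3, 0.6, 0.9, 1.0], k=1)[0]

# New style: weights are each item's own probability mass.
new_pick = random.choices(items, weights=[0.3, 0.3, 0.3, 0.1], k=1)[0]

print(old_pick, new_pick)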

data-science-onramp/data-ingestion/setup.sh

Lines changed: 3 additions & 0 deletions
@@ -1,4 +1,7 @@
 # Submit a PySpark job via the Cloud Dataproc Jobs API
+# Requires having CLUSTER_NAME and BUCKET_NAME set as
+# environment variables
+
 gcloud dataproc jobs submit pyspark \
     --cluster ${CLUSTER_NAME} \
     --jars gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar \
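
The new comment in setup.sh documents that the script reads CLUSTER_NAME and BUCKET_NAME from the environment. A sketch of how it would be invoked under that assumption (the values below are placeholders, not from the repo):

# Placeholders; substitute a real Dataproc cluster and GCS bucket.
export CLUSTER_NAME=my-dataproc-cluster
export BUCKET_NAME=my-staging-bucket

bash data-science-onramp/data-ingestion/setup.sh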

data-science-onramp/data-ingestion/setup-test.py renamed to data-science-onramp/data-ingestion/setup_test.py

Lines changed: 58 additions & 87 deletions
@@ -11,62 +11,25 @@
 
 import pytest
 
-waiting_cluster_callback = False
 
 # Set global variables
-project = os.environ['GCLOUD_PROJECT']
-region = "us-central1"
-zone = "us-central1-a"
-cluster_name = 'setup-test-{}'.format(str(uuid.uuid4()))
-bucket_name = 'setup-test-code-{}'.format(str(uuid.uuid4()))
+PROJECT = os.environ['GCLOUD_PROJECT']
+REGION = "us-central1"
+ZONE = "us-central1-a"
+CLUSTER_NAME = f'setup-test-{uuid.uuid4()}'
+BUCKET_NAME = f'setup-test-code-{uuid.uuid4()}'
 
+BUCKET = None
 
-@pytest.fixture(autouse=True)
-def teardown():
-    yield
-
-    # Delete cluster
-    cluster_client = dataproc.ClusterControllerClient(client_options={
-        'api_endpoint': f'{region}-dataproc.googleapis.com:443'
-    })
-
-    try:
-        operation = cluster_client.delete_cluster(project, region,
-                                                  cluster_name)
-        operation.result()
-    except GoogleAPICallError:
-        pass
-
-    # Delete GCS bucket
-    storage_client = storage.Client()
-    try:
-        bucket = storage_client.get_bucket(bucket_name)
-        bucket.delete(force=True)
-    except NotFound:
-        pass
-
-
-def test_setup(capsys):
-    '''Tests setup.py by submitting it to a dataproc cluster'''
-
-    # Create GCS Bucket
-    storage_client = storage.Client()
-    bucket = storage_client.create_bucket(bucket_name)
-
-    # Upload file
-    destination_blob_name = "setup.py"
-    blob = bucket.blob(destination_blob_name)
-    blob.upload_from_filename("setup.py")
-
-    job_file_name = "gs://" + bucket_name + "/setup.py"
 
+@pytest.fixture(autouse=True)
+def setup_and_teardown_cluster():
     # Create cluster configuration
     zone_uri = \
-        'https://www.googleapis.com/compute/v1/projects/{}/zones/{}'.format(
-            project, zone)
+        f'https://www.googleapis.com/compute/v1/projects/{PROJECT}/zones/{ZONE}'
     cluster_data = {
-        'project_id': project,
-        'cluster_name': cluster_name,
+        'project_id': PROJECT,
+        'cluster_name': CLUSTER_NAME,
         'config': {
             'gce_cluster_config': {
                 'zone_uri': zone_uri,
@@ -99,27 +62,59 @@ def test_setup(capsys):
 
     # Create cluster using cluster client
     cluster_client = dataproc.ClusterControllerClient(client_options={
-        'api_endpoint': '{}-dataproc.googleapis.com:443'.format(region)
+        'api_endpoint': '{}-dataproc.googleapis.com:443'.format(REGION)
     })
 
-    cluster = cluster_client.create_cluster(project, region, cluster_data)
-    cluster.add_done_callback(callback)
+    operation = cluster_client.create_cluster(PROJECT, REGION, cluster_data)
 
     # Wait for cluster to provision
-    global waiting_cluster_callback
-    waiting_cluster_callback = True
+    operation.result()
 
-    wait_for_cluster_creation()
+    yield
+
+    # Delete cluster
+    cluster_client = dataproc.ClusterControllerClient(client_options={
+        'api_endpoint': f'{REGION}-dataproc.googleapis.com:443'
+    })
+
+    operation = cluster_client.delete_cluster(PROJECT, REGION,
+                                              CLUSTER_NAME)
+    operation.result()
+
+
+@pytest.fixture(autouse=True)
+def setup_and_teardown_bucket():
+    global BUCKET
+    # Create GCS Bucket
+    storage_client = storage.Client()
+    BUCKET = storage_client.create_bucket(BUCKET_NAME)
+
+    yield
+
+    # Delete GCS bucket
+    storage_client = storage.Client()
+    bucket = storage_client.get_bucket(BUCKET_NAME)
+    bucket.delete(force=True)
+
+def test_setup(capsys):
+    '''Tests setup.py by submitting it to a dataproc cluster'''
+
+    # Upload file
+    destination_blob_name = "setup.py"
+    blob = BUCKET.blob(destination_blob_name)
+    blob.upload_from_filename("setup.py")
+
+    job_file_name = "gs://" + BUCKET_NAME + "/setup.py"
 
     # Create job configuration
     job_details = {
         'placement': {
-            'cluster_name': cluster_name
+            'cluster_name': CLUSTER_NAME
         },
         'pyspark_job': {
             'main_python_file_uri': job_file_name,
             'args': [
-                bucket_name,
+                BUCKET_NAME,
                 "--test",
             ],
             "jar_file_uris": [
@@ -130,25 +125,21 @@ def test_setup(capsys):
 
     # Submit job to dataproc cluster
     job_client = dataproc.JobControllerClient(client_options={
-        'api_endpoint': '{}-dataproc.googleapis.com:443'.format(region)
+        'api_endpoint': '{}-dataproc.googleapis.com:443'.format(REGION)
     })
 
-    result = job_client.submit_job(project_id=project, region=region,
+    response = job_client.submit_job(project_id=PROJECT, region=REGION,
                                    job=job_details)
 
-    job_id = result.reference.job_id
+    job_id = response.reference.job_id
     print('Submitted job \"{}\".'.format(job_id))
 
     # Wait for job to complete
-    wait_for_job(job_client, job_id)
+    result = response.add_done_callback(callback)
 
     # Get job output
-    cluster_info = cluster_client.get_cluster(project, region, cluster_name)
-    bucket = storage_client.get_bucket(cluster_info.config.config_bucket)
-    output_blob = (
-        'google-cloud-dataproc-metainfo/{}/jobs/{}/driveroutput.000000000'
-        .format(cluster_info.cluster_uuid, job_id))
-    out = bucket.blob(output_blob).download_as_string().decode("utf-8")
+    output_location = result.driver_output_resource_uri() + ".000000000"
+    output = BUCKET.blob(output_location).download_as_string().decode("utf-8")
 
     # tripDuration
     assert re.search("[0-9] s", out)
@@ -186,25 +177,5 @@ def test_setup(capsys):
     # Missing data
     assert "null" in out
 
-
 def callback(operation_future):
-    '''Sets a flag to stop waiting'''
-    global waiting_cluster_callback
-    waiting_cluster_callback = False
-
-
-def wait_for_cluster_creation():
-    '''Waits for cluster to create'''
-    while True:
-        if not waiting_cluster_callback:
-            break
-
-
-def wait_for_job(job_client, job_id):
-    '''Waits for job to finish'''
-    while True:
-        job = job_client.get_job(project, region, job_id)
-        assert job.status.State.Name(job.status.state) != "ERROR"
-
-        if job.status.State.Name(job.status.state) == "DONE":
-            return
+    return operation_future.result()
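
The test refactor above drops the hand-rolled callback and polling helpers in favor of pytest's yield-style fixtures: code before the yield runs as setup, code after it runs as teardown, and autouse=True applies the fixture to every test in the module without it being requested explicitly. A stripped-down sketch of that pattern with a stand-in resource (the FakeResource class is hypothetical; only the fixture mechanics mirror setup_test.py):

import pytest


class FakeResource:
    """Stand-in for the cluster/bucket managed by the real fixtures."""
    def __init__(self):
        self.alive = True

    def delete(self):
        self.alive = False


RESOURCE = None


@pytest.fixture(autouse=True)
def setup_and_teardown_resource():
    global RESOURCE
    # Setup: runs before each test in the module.
    RESOURCE = FakeResource()

    yield  # The test body executes here.

    # Teardown: runs after the test, even if it failed.
    RESOURCE.delete()


def test_resource_is_available():
    assert RESOURCE is not None and RESOURCE.alive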
