
Commit 744f80c

get dataproc job output and fix linting
1 parent 4afbf1c commit 744f80c

File tree

3 files changed: 22 additions & 21 deletions

.gitignore

Lines changed: 1 addition & 0 deletions

@@ -27,3 +27,4 @@ credentials.dat
 .DS_store
 env/
 .idea
+data-science-onramp/data-ingestion/noxfile.py

data-science-onramp/data-ingestion/setup.py

Lines changed: 8 additions & 6 deletions

@@ -12,7 +12,7 @@
 from pyspark.sql.types import IntegerType, StringType
 
 
-BUCKET_NAME = sys.argv[1]
+BUCKET_NAME = sys.argv[1]
 TABLE = "bigquery-public-data.new_york_citibike.citibike_trips"
 
 
@@ -81,7 +81,8 @@ def udf(col_value):
 def id(x):
     return x
 
-def write_to_bigquery(df):
+
+def write_to_bigquery(spark, df):
     '''Write a dataframe to BigQuery'''
 
     # Create BigQuery Dataset
@@ -98,6 +99,7 @@ def write_to_bigquery(df):
         .option('table', dataset_id + ".RAW_DATA") \
         .save()
 
+
 def main():
     # Create a SparkSession under the name "setup". Viewable via the Spark UI
     spark = SparkSession.builder.appName("setup").getOrCreate()
@@ -143,16 +145,16 @@ def main():
     new_df = df.select(*[UserDefinedFunction(*udf)(column).alias(name)
                          for udf, column, name in zip(udfs, df.columns, names)])
 
+    new_df.sample(False, 0.0001, seed=50).show(n=100)
+
     # Duplicate about 0.01% of the rows
-    dup_df = new_df.sample(False, 0.0001, seed=42)
+    dup_df = new_df.sample(True, 0.0001, seed=42)
 
     # Create final dirty dataframe
     df = new_df.union(dup_df)
-    df.sample(False, 0.0001, seed=50).show(n=200)
-    print("Dataframe sample printed")
 
     if upload:
-        write_to_bigquery(df)
+        write_to_bigquery(spark, df)
 
 
 if __name__ == '__main__':
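For context, the change above now passes the SparkSession into write_to_bigquery and flips the duplicate sample to sampling with replacement. A minimal sketch of how the updated pieces could fit together follows; the dataset name, the temporaryGcsBucket setting, and the toy dataframe are illustrative assumptions, since this diff only shows fragments of setup.py.

import sys

from google.cloud import bigquery
from pyspark.sql import SparkSession

BUCKET_NAME = sys.argv[1]


def write_to_bigquery(spark, df):
    '''Write a dataframe to BigQuery (sketch; the dataset name is assumed).'''
    # Create the BigQuery dataset if it does not already exist.
    client = bigquery.Client()
    dataset_id = client.project + ".data_science_onramp"  # hypothetical name
    client.create_dataset(dataset_id, exists_ok=True)

    # One plausible reason the session is now passed in: the spark-bigquery
    # connector can read its GCS staging bucket from the session config.
    spark.conf.set("temporaryGcsBucket", BUCKET_NAME)

    df.write.format("bigquery") \
        .option("table", dataset_id + ".RAW_DATA") \
        .save()


def main():
    spark = SparkSession.builder.appName("setup").getOrCreate()

    # Toy dataframe standing in for the transformed citibike data.
    new_df = spark.createDataFrame([(i, i * 60) for i in range(1000)],
                                   ["trip_id", "tripduration"])

    # sample(True, ...) samples WITH replacement, so the ~0.01% of rows
    # drawn here can themselves repeat -- the intended "dirty" duplicates.
    dup_df = new_df.sample(True, 0.0001, seed=42)
    df = new_df.union(dup_df)

    write_to_bigquery(spark, df)


if __name__ == "__main__":
    main()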

data-science-onramp/data-ingestion/setup_test.py

Lines changed: 13 additions & 15 deletions

@@ -3,11 +3,8 @@
 
 import uuid
 
-from google.api_core.exceptions import GoogleAPICallError
-
 from google.cloud import dataproc_v1 as dataproc
 from google.cloud import storage
-from google.cloud.exceptions import NotFound
 
 import pytest
 
@@ -52,7 +49,7 @@ def setup_and_teardown_cluster():
             }
         ],
         "software_config": {
-            "image_version": "1.4-debian10",
+            "image_version": "1.5.4-debian10",
             "optional_components": [
                 "ANACONDA"
             ],
@@ -96,6 +93,7 @@ def setup_and_teardown_bucket():
     bucket = storage_client.get_bucket(BUCKET_NAME)
     bucket.delete(force=True)
 
+
 def test_setup(capsys):
     '''Tests setup.py by submitting it to a dataproc cluster'''
 
@@ -129,22 +127,15 @@ def test_setup(capsys):
     })
 
     response = job_client.submit_job_as_operation(project_id=PROJECT, region=REGION,
-                                                 job=job_details)
+                                                  job=job_details)
 
     # Wait for job to complete
     result = response.result()
 
-    cluster_client = dataproc.ClusterControllerClient(client_options={
-        'api_endpoint': '{}-dataproc.googleapis.com:443'.format(REGION)
-    })
-
-    cluster_info = cluster_client.get_cluster(PROJECT, REGION, CLUSTER_NAME)
-
     # Get job output
-    output_location = result.driver_output_resource_uri + "000000000"  # + "driveroutput.000000000"
-    storage_client = storage.Client()
-    bucket = storage_client.get_bucket(cluster_info.config.config_bucket)
-    output = bucket.blob(output_location).download_as_string().decode("utf-8")
+    output_location = result.driver_output_resource_uri + ".000000000"
+    blob = get_blob_from_path(output_location)
+    out = blob.download_as_string().decode("utf-8")
 
     # tripDuration
     assert re.search("[0-9] s", out)
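For reference, a hedged sketch of the submit-and-wait flow the test now uses, before the file's final hunk (below) adds the get_blob_from_path helper that maps the returned URI to a Cloud Storage blob. The constants and job arguments here are hypothetical stand-ins for the fixtures defined elsewhere in setup_test.py.

from google.cloud import dataproc_v1 as dataproc

# Hypothetical stand-ins for the module-level constants used by the test.
PROJECT = "my-project"
REGION = "us-central1"
CLUSTER_NAME = "setup-test-cluster"
BUCKET_NAME = "setup-test-bucket"


def submit_setup_job():
    '''Submit setup.py as a PySpark job and return the driver-output URI.'''
    job_client = dataproc.JobControllerClient(client_options={
        'api_endpoint': '{}-dataproc.googleapis.com:443'.format(REGION)
    })

    job_details = {
        'placement': {'cluster_name': CLUSTER_NAME},
        'pyspark_job': {
            # Assumed layout: the script under test is staged in the test bucket.
            'main_python_file_uri': 'gs://{}/setup.py'.format(BUCKET_NAME),
            'args': [BUCKET_NAME, '--test'],
        },
    }

    # submit_job_as_operation returns a long-running operation; result()
    # blocks until the job finishes and yields the final Job message.
    operation = job_client.submit_job_as_operation(
        project_id=PROJECT, region=REGION, job=job_details)
    result = operation.result()

    # Dataproc writes the driver's stdout to the cluster's staging bucket
    # under this URI; the first output chunk carries a ".000000000" suffix.
    return result.driver_output_resource_uri + ".000000000"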
@@ -181,3 +172,10 @@ def test_setup(capsys):
 
     # Missing data
     assert "null" in out
+
+
+def get_blob_from_path(path):
+    bucket_name = re.search("dataproc.+?/", path).group(0)[0:-1]
+    bucket = storage.Client().get_bucket(bucket_name)
+    output_location = re.search("google-cloud-dataproc.+", path).group(0)
+    return bucket.blob(output_location)
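The new helper leans on the shape of the driver-output URI: a Dataproc staging bucket whose name begins with "dataproc", followed by an object path under "google-cloud-dataproc-metainfo/". A quick illustration of the two regexes with a made-up URI (the bucket name and cluster UUID below are invented):

import re

uri = ("gs://dataproc-staging-us-central1-123456789012-abcd1234/"
       "google-cloud-dataproc-metainfo/1111-2222-3333/jobs/setup-test-job/"
       "driveroutput.000000000")

# "dataproc.+?/" grabs the shortest run from "dataproc" to the next slash,
# i.e. the bucket name plus a trailing "/", which [0:-1] strips off.
bucket_name = re.search("dataproc.+?/", uri).group(0)[0:-1]
# -> "dataproc-staging-us-central1-123456789012-abcd1234"

# "google-cloud-dataproc.+" grabs everything from the metainfo prefix onward,
# which is the blob (object) name inside that bucket.
blob_name = re.search("google-cloud-dataproc.+", uri).group(0)
# -> "google-cloud-dataproc-metainfo/.../jobs/setup-test-job/driveroutput.000000000"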
