Commit 8cd7dc6

fix PR comments
1 parent 744f80c commit 8cd7dc6

File tree

3 files changed: +73 -110 lines changed


.gitignore

Lines changed: 0 additions & 30 deletions
This file was deleted.

data-science-onramp/data-ingestion/setup.py

Lines changed: 15 additions & 13 deletions
@@ -46,8 +46,8 @@ def user_type(user):
 
 def gender(s):
     '''Manipulates the gender string'''
-    return random.choice([s, s.upper(), s.lower(),
-                          s[0] if len(s) > 0 else "",
+    return random.choice([s.upper(), s.lower(),
+                          s[0].upper() if len(s) > 0 else "",
                           s[0].lower() if len(s) > 0 else ""])
 
 
@@ -78,7 +78,9 @@ def udf(col_value):
     return udf
 
 
-def id(x):
+# This function is required because we need to apply a
+# function for every column and some columns do not change
+def identity(x):
     return x
 
 
@@ -118,37 +120,37 @@ def main():
         df = spark.read.format('bigquery').option('table', TABLE).load()
     except Py4JJavaError:
         print(f"{TABLE} does not exist. ")
-        sys.exit(0)
+        return
 
     # Declare data transformations for each column in dataframe
     udfs = [
         (dirty_data(trip_duration, True), StringType()),  # tripduration
-        (dirty_data(id, True), StringType()),  # starttime
-        (dirty_data(id, True), StringType()),  # stoptime
-        (id, IntegerType()),  # start_station_id
+        (dirty_data(identity, True), StringType()),  # starttime
+        (dirty_data(identity, True), StringType()),  # stoptime
+        (identity, IntegerType()),  # start_station_id
         (dirty_data(station_name, False), StringType()),  # start_station_name
         (dirty_data(convert_angle, True), StringType()),  # start_station_latitude
         (dirty_data(convert_angle, True), StringType()),  # start_station_longitude
-        (id, IntegerType()),  # end_station_id
+        (identity, IntegerType()),  # end_station_id
         (dirty_data(station_name, False), StringType()),  # end_station_name
         (dirty_data(convert_angle, True), StringType()),  # end_station_latitude
         (dirty_data(convert_angle, True), StringType()),  # end_station_longitude
-        (id, IntegerType()),  # bikeid
+        (identity, IntegerType()),  # bikeid
         (dirty_data(user_type, False), StringType()),  # usertype
-        (id, IntegerType()),  # birth_year
+        (identity, IntegerType()),  # birth_year
         (dirty_data(gender, False), StringType()),  # gender
-        (id, StringType()),  # customer_plan
+        (identity, StringType()),  # customer_plan
     ]
 
     # Apply dirty transformations to df
     names = df.schema.names
     new_df = df.select(*[UserDefinedFunction(*udf)(column).alias(name)
                          for udf, column, name in zip(udfs, df.columns, names)])
 
-    new_df.sample(False, 0.0001, seed=50).show(n=100)
+    new_df.sample(False, 0.0001).show(n=100)
 
     # Duplicate about 0.01% of the rows
-    dup_df = new_df.sample(True, 0.0001, seed=42)
+    dup_df = new_df.sample(True, 0.0001)
 
     # Create final dirty dataframe
     df = new_df.union(dup_df)
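
Note on the id -> identity rename: the comment added in the diff explains that every column gets a (function, return type) pair, so columns that should pass through untouched still need a callable; the new name also avoids shadowing Python's built-in id. Below is a minimal sketch of that per-column UDF pattern, not the repository's code; the local SparkSession, the toy dataframe, and the shout helper are made up for illustration.

# Minimal sketch: apply one UDF per column, using identity for pass-through columns.
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType, StringType

spark = SparkSession.builder.master("local[*]").appName("udf-sketch").getOrCreate()
df = spark.createDataFrame([(1, "Subscriber"), (2, "Customer")],
                           ["bikeid", "usertype"])


def identity(x):
    # Passes the value through unchanged so every column can share the same pipeline.
    return x


def shout(s):
    # Hypothetical stand-in for one of the "dirty" transformations.
    return s.upper()


# One (function, return type) pair per column, mirroring the udfs list above.
udfs = [(identity, IntegerType()),   # bikeid is left unchanged
        (shout, StringType())]       # usertype is transformed

new_df = df.select(*[udf(f, t)(name).alias(name)
                     for (f, t), name in zip(udfs, df.columns)])

# Without an explicit seed, sample() draws a different random subset on each run,
# which is the effect of dropping seed=50 and seed=42 in the diff above.
new_df.sample(False, 0.5).show()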

data-science-onramp/data-ingestion/setup_test.py

Lines changed: 58 additions & 67 deletions
@@ -12,24 +12,36 @@
 # Set global variables
 PROJECT = os.environ['GCLOUD_PROJECT']
 REGION = "us-central1"
-ZONE = "us-central1-a"
 CLUSTER_NAME = f'setup-test-{uuid.uuid4()}'
 BUCKET_NAME = f'setup-test-code-{uuid.uuid4()}'
-
-BUCKET = None
+DESTINATION_BLOB_NAME = "setup.py"
+JOB_FILE_NAME = f'gs://{BUCKET_NAME}/setup.py'
+JOB_DETAILS = {  # Job configuration
+    'placement': {
+        'cluster_name': CLUSTER_NAME
+    },
+    'pyspark_job': {
+        'main_python_file_uri': JOB_FILE_NAME,
+        'args': [
+            BUCKET_NAME,
+            "--test",
+        ],
+        "jar_file_uris": [
+            "gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar"
+        ],
+    },
+}
 
 
 @pytest.fixture(autouse=True)
 def setup_and_teardown_cluster():
     # Create cluster configuration
-    zone_uri = \
-        f'https://www.googleapis.com/compute/v1/projects/{PROJECT}/zones/{ZONE}'
     cluster_data = {
         'project_id': PROJECT,
         'cluster_name': CLUSTER_NAME,
         'config': {
             'gce_cluster_config': {
-                'zone_uri': zone_uri,
+                'zone_uri': '',
                 "metadata": {
                     "PIP_PACKAGES": "google-cloud-storage"
                 },
@@ -59,9 +71,8 @@ def setup_and_teardown_cluster():
 
     # Create cluster using cluster client
     cluster_client = dataproc.ClusterControllerClient(client_options={
-        'api_endpoint': '{}-dataproc.googleapis.com:443'.format(REGION)
+        'api_endpoint': f'{REGION}-dataproc.googleapis.com:443'
     })
-
     operation = cluster_client.create_cluster(PROJECT, REGION, cluster_data)
 
     # Wait for cluster to provision
@@ -70,64 +81,48 @@ def setup_and_teardown_cluster():
     yield
 
     # Delete cluster
-    cluster_client = dataproc.ClusterControllerClient(client_options={
-        'api_endpoint': f'{REGION}-dataproc.googleapis.com:443'
-    })
-
     operation = cluster_client.delete_cluster(PROJECT, REGION,
                                               CLUSTER_NAME)
     operation.result()
 
 
 @pytest.fixture(autouse=True)
 def setup_and_teardown_bucket():
-    global BUCKET
     # Create GCS Bucket
     storage_client = storage.Client()
-    BUCKET = storage_client.create_bucket(BUCKET_NAME)
+    bucket = storage_client.create_bucket(BUCKET_NAME)
+
+    # Upload file
+    blob = bucket.blob(DESTINATION_BLOB_NAME)
+    blob.upload_from_filename("setup.py")
 
     yield
 
     # Delete GCS bucket
-    storage_client = storage.Client()
     bucket = storage_client.get_bucket(BUCKET_NAME)
     bucket.delete(force=True)
 
 
-def test_setup(capsys):
-    '''Tests setup.py by submitting it to a dataproc cluster'''
+def get_blob_from_path(path):
+    bucket_name = re.search("dataproc.+?/", path).group(0)[0:-1]
+    bucket = storage.Client().get_bucket(bucket_name)
+    output_location = re.search("google-cloud-dataproc.+", path).group(0)
+    return bucket.blob(output_location)
 
-    # Upload file
-    destination_blob_name = "setup.py"
-    blob = BUCKET.blob(destination_blob_name)
-    blob.upload_from_filename("setup.py")
 
-    job_file_name = "gs://" + BUCKET_NAME + "/setup.py"
-
-    # Create job configuration
-    job_details = {
-        'placement': {
-            'cluster_name': CLUSTER_NAME
-        },
-        'pyspark_job': {
-            'main_python_file_uri': job_file_name,
-            'args': [
-                BUCKET_NAME,
-                "--test",
-            ],
-            "jar_file_uris": [
-                "gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar"
-            ],
-        },
-    }
+def is_in_table(value, out):
+    return re.search(f"\| *{value}\|", out)
+
+
+def test_setup():
+    '''Tests setup.py by submitting it to a dataproc cluster'''
 
     # Submit job to dataproc cluster
     job_client = dataproc.JobControllerClient(client_options={
-        'api_endpoint': '{}-dataproc.googleapis.com:443'.format(REGION)
+        'api_endpoint': f'{REGION}-dataproc.googleapis.com:443'
    })
-
     response = job_client.submit_job_as_operation(project_id=PROJECT, region=REGION,
-                                                  job=job_details)
+                                                  job=JOB_DETAILS)
 
     # Wait for job to complete
     result = response.result()
@@ -150,32 +145,28 @@ def test_setup(capsys):
     assert re.search("20[0-9][0-9]\\|", out)
 
     # gender
-    assert "M" in out
-    assert "male" in out
-    assert "MALE" in out
-    assert "F" in out
-    assert "female" in out
-    assert "FEMALE" in out
-    assert "u" in out
-    assert "unknown" in out
-    assert "UNKNOWN" in out
+    assert is_in_table("M", out)
+    assert is_in_table("m", out)
+    assert is_in_table("male", out)
+    assert is_in_table("MALE", out)
+    assert is_in_table("F", out)
+    assert is_in_table("f", out)
+    assert is_in_table("female", out)
+    assert is_in_table("FEMALE", out)
+    assert is_in_table("U", out)
+    assert is_in_table("u", out)
+    assert is_in_table("unknown", out)
+    assert is_in_table("UNKNOWN", out)
 
     # customer_plan
-    assert "Subscriber" in out
-    assert "subscriber" in out
-    assert "SUBSCRIBER" in out
-    assert "sub" in out
-    assert "Customer" in out
-    assert "customer" in out
-    assert "CUSTOMER" in out
-    assert "cust" in out
+    assert is_in_table("Subscriber", out)
+    assert is_in_table("subscriber", out)
+    assert is_in_table("SUBSCRIBER", out)
+    assert is_in_table("sub", out)
+    assert is_in_table("Customer", out)
+    assert is_in_table("customer", out)
+    assert is_in_table("CUSTOMER", out)
+    assert is_in_table("cust", out)
 
     # Missing data
-    assert "null" in out
-
-
-def get_blob_from_path(path):
-    bucket_name = re.search("dataproc.+?/", path).group(0)[0:-1]
-    bucket = storage.Client().get_bucket(bucket_name)
-    output_location = re.search("google-cloud-dataproc.+", path).group(0)
-    return bucket.blob(output_location)
+    assert is_in_table("null", out)
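
Note on the new is_in_table() helper: it tightens the old substring assertions. "M" in out matches the letter M anywhere in the captured output, while the regex only matches a complete table cell of the form |   value| in the text printed by DataFrame.show(). The added single-letter cases (m, f, U) line up with the s[0].upper()/s[0].lower() branches introduced in setup.py. Below is a small self-contained sketch of the helper's behaviour; the table string is made up for illustration and is not real job output.

# Minimal sketch of cell-level matching against show()-style output.
import re


def is_in_table(value, out):
    # Matches "|<optional padding>value|", i.e. one complete table cell.
    return re.search(rf"\| *{value}\|", out)


# Hypothetical show() output, trimmed for illustration.
out = """
+------+----------+
|bikeid|  usertype|
+------+----------+
|     1|SUBSCRIBER|
|     2|  customer|
+------+----------+
"""

assert is_in_table("SUBSCRIBER", out)
assert is_in_table("customer", out)
assert "SUB" in out                  # a naive substring check passes here...
assert not is_in_table("SUB", out)   # ...but the cell-level check correctly does not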
