Commit 92cf763

add data ingestion code
1 parent 15fede4 commit 92cf763

5 files changed: 372 additions, 0 deletions

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
pytest==5.3.2

Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
grpcio==1.29.0
google-auth==1.16.0
google-auth-httplib2==0.0.3
google-cloud==0.34.0
google-cloud-storage==1.28.1
google-cloud-dataproc==0.8.0

Lines changed: 210 additions & 0 deletions
@@ -0,0 +1,210 @@
import os
import re

import uuid

from google.api_core.exceptions import GoogleAPICallError

from google.cloud import dataproc_v1 as dataproc
from google.cloud import storage
from google.cloud.exceptions import NotFound

import pytest

waiting_cluster_callback = False

# Set global variables
project = os.environ['GCLOUD_PROJECT']
region = "us-central1"
zone = "us-central1-a"
cluster_name = 'setup-test-{}'.format(str(uuid.uuid4()))
bucket_name = 'setup-test-code-{}'.format(str(uuid.uuid4()))


@pytest.fixture(autouse=True)
def teardown():
    yield

    # Delete cluster
    cluster_client = dataproc.ClusterControllerClient(client_options={
        'api_endpoint': f'{region}-dataproc.googleapis.com:443'
    })

    try:
        operation = cluster_client.delete_cluster(project, region,
                                                  cluster_name)
        operation.result()
    except GoogleAPICallError:
        pass

    # Delete GCS bucket
    storage_client = storage.Client()
    try:
        bucket = storage_client.get_bucket(bucket_name)
        bucket.delete(force=True)
    except NotFound:
        pass


def test_setup(capsys):
    '''Tests setup.py by submitting it to a dataproc cluster'''

    # Create GCS Bucket
    storage_client = storage.Client()
    bucket = storage_client.create_bucket(bucket_name)

    # Upload file
    destination_blob_name = "setup.py"
    blob = bucket.blob(destination_blob_name)
    blob.upload_from_filename("setup.py")

    job_file_name = "gs://" + bucket_name + "/setup.py"

    # Create cluster configuration
    zone_uri = \
        'https://www.googleapis.com/compute/v1/projects/{}/zones/{}'.format(
            project, zone)
    cluster_data = {
        'project_id': project,
        'cluster_name': cluster_name,
        'config': {
            'gce_cluster_config': {
                'zone_uri': zone_uri,
                "metadata": {
                    "PIP_PACKAGES": "google-cloud-storage"
                },
            },
            'master_config': {
                'num_instances': 1,
                'machine_type_uri': 'n1-standard-8'
            },
            'worker_config': {
                'num_instances': 6,
                'machine_type_uri': 'n1-standard-8'
            },
            "initialization_actions": [
                {
                    "executable_file": ("gs://dataproc-initialization-actions/"
                                        "python/pip-install.sh"),
                }
            ],
            "software_config": {
                "image_version": "1.5.4-debian10",
                "optional_components": [
                    "ANACONDA"
                ],
            }
        }
    }

    # Create cluster using cluster client
    cluster_client = dataproc.ClusterControllerClient(client_options={
        'api_endpoint': '{}-dataproc.googleapis.com:443'.format(region)
    })

    cluster = cluster_client.create_cluster(project, region, cluster_data)
    cluster.add_done_callback(callback)

    # Wait for cluster to provision
    global waiting_cluster_callback
    waiting_cluster_callback = True

    wait_for_cluster_creation()

    # Create job configuration
    job_details = {
        'placement': {
            'cluster_name': cluster_name
        },
        'pyspark_job': {
            'main_python_file_uri': job_file_name,
            'args': [
                bucket_name,
                "--test",
            ],
            "jar_file_uris": [
                "gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar"
            ],
        },
    }

    # Submit job to dataproc cluster
    job_client = dataproc.JobControllerClient(client_options={
        'api_endpoint': '{}-dataproc.googleapis.com:443'.format(region)
    })

    result = job_client.submit_job(project_id=project, region=region,
                                   job=job_details)

    job_id = result.reference.job_id
    print('Submitted job \"{}\".'.format(job_id))

    # Wait for job to complete
    wait_for_job(job_client, job_id)

    # Get job output
    cluster_info = cluster_client.get_cluster(project, region, cluster_name)
    bucket = storage_client.get_bucket(cluster_info.config.config_bucket)
    output_blob = (
        'google-cloud-dataproc-metainfo/{}/jobs/{}/driveroutput.000000000'
        .format(cluster_info.cluster_uuid, job_id))
    out = bucket.blob(output_blob).download_as_string().decode("utf-8")

    # tripDuration
    assert re.search("[0-9] s", out)
    assert re.search("[0-9] m", out)
    assert re.search("[0-9] h", out)

    # station latitude & longitude
    assert re.search(u"\u00B0" + "[0-9]+\'[0-9]+\"", out)

    # birth_year
    assert re.search("19[0-9][0-9]\\|", out)
    assert re.search("20[0-9][0-9]\\|", out)

    # gender
    assert "M" in out
    assert "male" in out
    assert "MALE" in out
    assert "F" in out
    assert "female" in out
    assert "FEMALE" in out
    assert "u" in out
    assert "unknown" in out
    assert "UNKNOWN" in out

    # customer_plan
    assert "Subscriber" in out
    assert "subscriber" in out
    assert "SUBSCRIBER" in out
    assert "sub" in out
    assert "Customer" in out
    assert "customer" in out
    assert "CUSTOMER" in out
    assert "cust" in out

    # Missing data
    assert "null" in out


def callback(operation_future):
    '''Sets a flag to stop waiting'''
    global waiting_cluster_callback
    waiting_cluster_callback = False


def wait_for_cluster_creation():
    '''Waits for cluster to create'''
    while True:
        if not waiting_cluster_callback:
            break


def wait_for_job(job_client, job_id):
    '''Waits for job to finish'''
    while True:
        job = job_client.get_job(project, region, job_id)
        assert job.status.State.Name(job.status.state) != "ERROR"

        if job.status.State.Name(job.status.state) == "DONE":
            return
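
Usage note, not part of the committed diff: the test above reads its project from the GCLOUD_PROJECT environment variable and provisions a real Dataproc cluster and GCS bucket, so it needs authenticated Google Cloud credentials plus the pinned packages from the two requirements files. A minimal sketch of running it; the test file path and project id are assumptions, since the diff does not show file names:

# Hypothetical invocation; setup_test.py and my-gcp-project are placeholders.
export GCLOUD_PROJECT=my-gcp-project
pip install pytest==5.3.2 google-cloud-storage==1.28.1 google-cloud-dataproc==0.8.0
pytest setup_test.py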

Lines changed: 149 additions & 0 deletions
@@ -0,0 +1,149 @@
from random import choice, choices, randint, seed
import sys

from time import time_ns

from google.cloud import bigquery

from py4j.protocol import Py4JJavaError
from pyspark.sql import SparkSession

from pyspark.sql.functions import UserDefinedFunction
from pyspark.sql.types import IntegerType, StringType


# Create a SparkSession under the name "setup". Viewable via the Spark UI
spark = SparkSession.builder.appName("setup").getOrCreate()

bucket_name = sys.argv[1]
upload = True  # Whether to upload data to BigQuery

# Check whether or not results should be uploaded
try:
    sys.argv[2]
    upload = False
except IndexError:
    print("Results will be uploaded to BigQuery")

table = "bigquery-public-data.new_york_citibike.citibike_trips"

# Check if table exists
try:
    df = spark.read.format('bigquery').option('table', table).load()
except Py4JJavaError:
    print(f"{table} does not exist. ")
    sys.exit(0)

# START MAKING DATA DIRTY


def random_select(items, cum_weights):
    '''Picks an item according to the cumulative weights'''
    return choices(items, cum_weights=cum_weights, k=1)[0]


def tripduration(duration):
    '''Converts trip duration to other units'''
    seconds = str(duration) + " s"
    minutes = str(float(duration) / 60) + " min"
    hours = str(float(duration) / 3600) + " h"
    return random_select([seconds, minutes, hours, str(randint(-1000, -1))],
                         [0.3, 0.6, 0.9, 1])


def station_name(name):
    '''Replaces '&' with '/' with a 50% chance'''
    return choice([name, name.replace("&", "/")])


def usertype(user):
    '''Manipulates the user type string'''
    return choice([user, user.upper(), user.lower(),
                   "sub" if user == "Subscriber" else user,
                   "cust" if user == "Customer" else user])


def gender(s):
    '''Manipulates the gender string'''
    return choice([s, s.upper(), s.lower(),
                   s[0] if len(s) > 0 else "",
                   s[0].lower() if len(s) > 0 else ""])


def convertAngle(angle):
    '''Converts long and lat to DMS notation'''
    degrees = int(angle)
    minutes = int((angle - degrees) * 60)
    seconds = int((angle - degrees - minutes/60) * 3600)
    new_angle = str(degrees) + u"\u00B0" + \
        str(minutes) + "'" + str(seconds) + '"'
    return random_select([str(angle), new_angle], cum_weights=[0.55, 1])


def dirty_data(proc_func, allow_none):
    '''Master function returns a user defined function
    that transforms the column data'''
    def udf(col_value):
        seed(hash(col_value) + time_ns())
        if col_value is None:
            return col_value
        elif allow_none:
            return random_select([None, proc_func(col_value)],
                                 cum_weights=[0.05, 1])
        else:
            return proc_func(col_value)
    return udf


def id(x):
    return x


# Declare data transformations for each column in dataframe
udfs = [
    (dirty_data(tripduration, True), StringType()),  # tripduration
    (dirty_data(id, True), StringType()),  # starttime
    (dirty_data(id, True), StringType()),  # stoptime
    (id, IntegerType()),  # start_station_id
    (dirty_data(station_name, False), StringType()),  # start_station_name
    (dirty_data(convertAngle, True), StringType()),  # start_station_latitude
    (dirty_data(convertAngle, True), StringType()),  # start_station_longitude
    (id, IntegerType()),  # end_station_id
    (dirty_data(station_name, False), StringType()),  # end_station_name
    (dirty_data(convertAngle, True), StringType()),  # end_station_latitude
    (dirty_data(convertAngle, True), StringType()),  # end_station_longitude
    (id, IntegerType()),  # bikeid
    (dirty_data(usertype, False), StringType()),  # usertype
    (id, IntegerType()),  # birth_year
    (dirty_data(gender, False), StringType()),  # gender
    (id, StringType()),  # customer_plan
]

# Apply dirty transformations to df
names = df.schema.names
new_df = df.select(*[UserDefinedFunction(*udf)(column).alias(name)
                     for udf, column, name in zip(udfs, df.columns, names)])

# Duplicate about 0.01% of the rows
dup_df = new_df.sample(False, 0.0001, seed=42)

# Create final dirty dataframe
df = new_df.union(dup_df)
df.sample(False, 0.0001, seed=50).show(n=200)
print("Dataframe sample printed")

# Write to BigQuery
if upload:
    # Create BigQuery Dataset
    client = bigquery.Client()
    dataset_id = '{}.new_york_citibike_trips'.format(client.project)
    dataset = bigquery.Dataset(dataset_id)
    dataset.location = "US"
    dataset = client.create_dataset(dataset)

    # Saving the data to BigQuery
    spark.conf.set('temporaryGcsBucket', bucket_name)

    df.write.format('bigquery') \
        .option('table', dataset_id + ".RAW_DATA") \
        .save()

Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
# Submit a PySpark job via the Cloud Dataproc Jobs API
gcloud dataproc jobs submit pyspark \
    --cluster ${CLUSTER_NAME} \
    --jars gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar \
    --driver-log-levels root=FATAL \
    setup.py -- ${BUCKET_NAME}
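
Usage note, not from the commit itself: the submit command expects CLUSTER_NAME and BUCKET_NAME to already be exported in the shell, and the bucket name after the -- separator is forwarded to the job, where setup.py picks it up as sys.argv[1] and sets it as temporaryGcsBucket for the BigQuery write. A minimal sketch with placeholder values; the script's own file name is not shown in this diff:

# Hypothetical values; substitute a real Dataproc cluster and staging bucket.
export CLUSTER_NAME=my-dataproc-cluster
export BUCKET_NAME=my-staging-bucket
bash submit_job.sh   # assumed file name for the snippet above

Because no second argument follows the bucket name here, upload stays True in setup.py, so this invocation is the path that actually writes the dirtied data to BigQuery; the test harness instead passes --test, which flips upload to False.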
