
Commit ca4e6a4

Symmetries, vuppalli, tk744, and leahecole authored
Data science onramp Data ingestion (GoogleCloudPlatform#4447)
* add data ingestion code
* begin addressing comments
* change submit job
* address code structure and global variable issues
* get dataproc job output and fix linting
* fix PR comments
* linting and global vars
* address Brad PR comments
* broken clean.py
* Revert "broken clean.py". This reverts commit 580c8e1.
* optimize data ingestion
* fix linting errors
* fix minor style issues
* remove pip from cluster config
* load external datasets from url
* add citibike dataset notebook
* address leahs comments
* add gas dataset code
* add scikit learn
* rename file
* small wording change
* address brad and diego comments
* fix linting issues
* add outputs
* add US holidays feature engineering
* Delete noxfile.py
* minor changes
* change output
* added dry-run flag
* address leah comments
* address brads comments
* add weather feature engineering
* dry-run flag
* normalize weather values
* address some review comments
* address comments from live code review
* add env var support and upload to gcs bucket
* change import order and clear incorrect output
* optimize setup test
* query data in test
* address live session comments
* add break statement
* revert breaking table and dataset name change
* small cleanup
* more cleanup
* fix incorrect outputs
* Data cleaning script
* addressed PR comments 1
* addressed PR comments 2
* Refactored to match tutorial doc
* added testing files
* added sh script
* added dry-run flag
* changed --test flag to --dry-run
* gcs upload now writes to temp location before copying objects to final location
* fixed linting
* linting fixes
* Revert "Dataset Feature Engineering"
* fix datetime formatting in setup job
* uncomment commented dataset creation and writing
* add data ingestion code
* begin addressing comments
* change submit job
* address code structure and global variable issues
* get dataproc job output and fix linting
* fix PR comments
* linting and global vars
* address Brad PR comments
* broken clean.py
* Revert "broken clean.py". This reverts commit 580c8e1.
* optimize data ingestion
* fix linting errors
* fix minor style issues
* remove pip from cluster config
* load external datasets from url
* added dry-run flag
* dry-run flag
* address some review comments
* optimize setup test
* query data in test
* address live session comments
* add break statement
* revert breaking table and dataset name change
* fix datetime formatting in setup job
* uncomment commented dataset creation and writing
* fix import order
* use GOOGLE_CLOUD_PROJECT environment variable
* blacken and add f-strings to dms notation
* change test variables names to match data cleaning
* blacken setup_test file
* fix unchanged variable name
* WIP: address PR comments
* apply temporary fix for ANACONDA optional component
* remove data cleaning files

Co-authored-by: vuppalli <[email protected]>
Co-authored-by: Tushar Khan <[email protected]>
Co-authored-by: Vismita Uppalli <[email protected]>
Co-authored-by: Leah E. Cole <[email protected]>
Co-authored-by: Tushar Khan <[email protected]>
1 parent 60d97ae commit ca4e6a4

File tree

6 files changed: +439 -29 lines changed


.gitignore

Lines changed: 0 additions & 29 deletions
This file was deleted.
Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
pytest==6.0.0
Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
#grpcio==1.29.0
#google-auth==1.16.0
#google-auth-httplib2==0.0.3
google-cloud-storage==1.28.1
google-cloud-dataproc==2.0.0
google-cloud-bigquery==1.25.0
Lines changed: 211 additions & 0 deletions
@@ -0,0 +1,211 @@
"""Setup Dataproc job for Data Science Onramp Sample Application
This job ingests an external gas prices in NY dataset as well as
takes a New York Citibike dataset available on BigQuery and
"dirties" the dataset before uploading it back to BigQuery
It needs the following arguments
* the name of the Google Cloud Storage bucket to be used
* the name of the BigQuery dataset to be created
* an optional --test flag to upload a subset of the dataset for testing
"""

import random
import sys

from google.cloud import bigquery
import pandas as pd
from py4j.protocol import Py4JJavaError
from pyspark.sql import SparkSession
from pyspark.sql.functions import date_format, expr, UserDefinedFunction, when
from pyspark.sql.types import FloatType, StringType, StructField, StructType

TABLE = "bigquery-public-data.new_york_citibike.citibike_trips"
CITIBIKE_TABLE_NAME = "RAW_DATA"
EXTERNAL_TABLES = {
    "gas_prices": {
        "url": "https://data.ny.gov/api/views/wuxr-ni2i/rows.csv",
        "schema": StructType(
            [
                StructField("Date", StringType(), True),
                StructField("New_York_State_Average_USD_per_Gal", FloatType(), True),
                StructField("Albany_Average_USD_per_Gal", FloatType(), True),
                StructField("Blinghamton_Average_USD_per_Gal", FloatType(), True),
                StructField("Buffalo_Average_USD_per_Gal", FloatType(), True),
                StructField("Nassau_Average_USD_per_Gal", FloatType(), True),
                StructField("New_York_City_Average_USD_per_Gal", FloatType(), True),
                StructField("Rochester_Average_USD_per_Gal", FloatType(), True),
                StructField("Syracuse_Average_USD_per_Gal", FloatType(), True),
                StructField("Utica_Average_USD_per_Gal", FloatType(), True),
            ]
        ),
    },
}


# START MAKING DATA DIRTY
def trip_duration(duration):
    """Converts trip duration to other units"""
    if not duration:
        return None
    seconds = f"{str(duration)} s"
    minutes = f"{str(float(duration) / 60)} min"
    hours = f"{str(float(duration) / 3600)} h"
    return random.choices(
        [seconds, minutes, hours, str(random.randint(-1000, -1))],
        weights=[0.3, 0.3, 0.3, 0.1],
    )[0]


def station_name(name):
    """Replaces '&' with '/' with a 50% chance"""
    if not name:
        return None
    return random.choice([name, name.replace("&", "/")])


def user_type(user):
    """Manipulates the user type string"""
    if not user:
        return None
    return random.choice(
        [
            user,
            user.upper(),
            user.lower(),
            "sub" if user == "Subscriber" else user,
            "cust" if user == "Customer" else user,
        ]
    )


def gender(s):
    """Manipulates the gender string"""
    if not s:
        return None
    return random.choice(
        [
            s.upper(),
            s.lower(),
            s[0].upper() if len(s) > 0 else "",
            s[0].lower() if len(s) > 0 else "",
        ]
    )


def convert_angle(angle):
    """Converts long and lat to DMS notation"""
    if not angle:
        return None
    degrees = int(angle)
    minutes = int((angle - degrees) * 60)
    seconds = int((angle - degrees - minutes / 60) * 3600)
    new_angle = f"{degrees}\u00B0{minutes}'{seconds}\""
    return random.choices([str(angle), new_angle], weights=[0.55, 0.45])[0]


def create_bigquery_dataset(dataset_name):
    # Create BigQuery Dataset
    client = bigquery.Client()
    dataset_id = f"{client.project}.{dataset_name}"
    dataset = bigquery.Dataset(dataset_id)
    dataset.location = "US"
    dataset = client.create_dataset(dataset)


def write_to_bigquery(df, table_name, dataset_name):
    """Write a dataframe to BigQuery"""
    client = bigquery.Client()
    dataset_id = f"{client.project}.{dataset_name}"

    # Saving the data to BigQuery
    df.write.format("bigquery").option("table", f"{dataset_id}.{table_name}").save()

    print(f"Table {table_name} successfully written to BigQuery")


def main():
    # Get command line arguments
    BUCKET_NAME = sys.argv[1]
    DATASET_NAME = sys.argv[2]

    # Create a SparkSession under the name "setup"
    spark = SparkSession.builder.appName("setup").getOrCreate()

    spark.conf.set("temporaryGcsBucket", BUCKET_NAME)

    create_bigquery_dataset(DATASET_NAME)

    # Whether we are running the job as a test
    test = False

    # Check whether or not the job is running as a test
    if "--test" in sys.argv:
        test = True
        print("A subset of the whole dataset will be uploaded to BigQuery")
    else:
        print("Results will be uploaded to BigQuery")

    # Ingest External Datasets
    for table_name, data in EXTERNAL_TABLES.items():
        df = spark.createDataFrame(pd.read_csv(data["url"]), schema=data["schema"])

        write_to_bigquery(df, table_name, DATASET_NAME)

    # Check if table exists
    try:
        df = spark.read.format("bigquery").option("table", TABLE).load()
        # if we are running a test, perform computations on a subset of the data
        if test:
            df = df.sample(False, 0.00001)
    except Py4JJavaError:
        print(f"{TABLE} does not exist. ")
        return

    # Declare dictionary with keys column names and values user defined
    # functions and return types
    udf_map = {
        "tripduration": (trip_duration, StringType()),
        "start_station_name": (station_name, StringType()),
        "start_station_latitude": (convert_angle, StringType()),
        "start_station_longitude": (convert_angle, StringType()),
        "end_station_name": (station_name, StringType()),
        "end_station_latitude": (convert_angle, StringType()),
        "end_station_longitude": (convert_angle, StringType()),
        "usertype": (user_type, StringType()),
        "gender": (gender, StringType()),
    }

    # Declare which columns to set some values to null randomly
    null_columns = [
        "tripduration",
        "starttime",
        "stoptime",
        "start_station_latitude",
        "start_station_longitude",
        "end_station_latitude",
        "end_station_longitude",
    ]

    # Dirty the columns
    for name, udf in udf_map.items():
        df = df.withColumn(name, UserDefinedFunction(*udf)(name))

    # Format the datetimes correctly
    for name in ["starttime", "stoptime"]:
        df = df.withColumn(name, date_format(name, "yyyy-MM-dd'T'HH:mm:ss"))

    # Randomly set about 5% of the values in some columns to null
    for name in null_columns:
        df = df.withColumn(name, when(expr("rand() < 0.05"), None).otherwise(df[name]))

    # Duplicate about 0.01% of the rows
    dup_df = df.sample(True, 0.0001)

    # Create final dirty dataframe
    df = df.union(dup_df)

    print("Uploading citibike dataset...")
    write_to_bigquery(df, CITIBIKE_TABLE_NAME, DATASET_NAME)


if __name__ == "__main__":
    main()
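
As a quick sanity check of what the "dirtying" step produces, the sketch below calls a few of the UDF helpers directly on sample values. It is not part of this commit; it assumes the script above is saved as setup.py (the name the submit script below passes to gcloud) and that its dependencies, including pyspark and the BigQuery client, are installed, since importing the module pulls them in.

import random

# Hypothetical preview, not part of the commit: import the dirtying helpers
# from the setup job above (assumes the file is importable as `setup`).
from setup import convert_angle, gender, trip_duration, user_type

random.seed(0)  # make the random choices repeatable for this preview

# Each helper returns one randomly chosen "dirty" variant of its input.
print(trip_duration(600))       # e.g. "600 s", "10.0 min", an hours string, or a random negative number
print(convert_angle(40.7128))   # either "40.7128" or a DMS string like 40°42'46"
print(user_type("Subscriber"))  # e.g. "Subscriber", "SUBSCRIBER", "subscriber", or "sub"
print(gender("male"))           # e.g. "MALE", "male", "M", or "m"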
Lines changed: 9 additions & 0 deletions
@@ -0,0 +1,9 @@
# Submit a PySpark job via the Cloud Dataproc Jobs API
# Requires having CLUSTER_NAME and BUCKET_NAME set as
# environment variables

gcloud dataproc jobs submit pyspark \
    --cluster ${CLUSTER_NAME} \
    --jars gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar \
    --driver-log-levels root=FATAL \
    setup.py -- ${BUCKET_NAME} new_york_citibike_trips
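
The same job can also be submitted programmatically with the google-cloud-dataproc client pinned in the requirements above. The sketch below is a rough, hedged equivalent of the gcloud command rather than part of the commit: the project, region, cluster, and bucket values are placeholders, and it assumes setup.py has already been copied to the bucket, since the Jobs API takes a GCS URI instead of a local file.

from google.cloud import dataproc_v1 as dataproc

# Placeholder values -- substitute your own project, region, cluster, and bucket.
project_id = "your-project-id"
region = "us-central1"
cluster_name = "your-cluster"
bucket_name = "your-bucket"

# Use the regional endpoint that matches the cluster's region.
job_client = dataproc.JobControllerClient(
    client_options={"api_endpoint": f"{region}-dataproc.googleapis.com:443"}
)

# Mirrors the gcloud invocation above: same main file, jar, and arguments.
job = {
    "placement": {"cluster_name": cluster_name},
    "pyspark_job": {
        "main_python_file_uri": f"gs://{bucket_name}/setup.py",  # assumes setup.py was uploaded to GCS
        "jar_file_uris": ["gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar"],
        "args": [bucket_name, "new_york_citibike_trips"],
    },
}

operation = job_client.submit_job_as_operation(
    request={"project_id": project_id, "region": region, "job": job}
)
response = operation.result()  # blocks until the job finishes
print(f"Job finished: {response.reference.job_id}")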
