Commit 3e86bda

address Brad PR comments
1 parent 81265d2 commit 3e86bda

2 files changed (+12, -25 lines)

data-science-onramp/data-ingestion/setup.py

Lines changed: 12 additions & 22 deletions
@@ -1,13 +1,10 @@
 import random
 import sys
-
 from time import time_ns
 
 from google.cloud import bigquery
-
 from py4j.protocol import Py4JJavaError
 from pyspark.sql import SparkSession
-
 from pyspark.sql.functions import UserDefinedFunction
 from pyspark.sql.types import IntegerType, StringType
 
@@ -56,7 +53,7 @@ def convert_angle(angle):
     degrees = int(angle)
     minutes = int((angle - degrees) * 60)
     seconds = int((angle - degrees - minutes/60) * 3600)
-    new_angle = str(degrees) + u"\u00B0" + \
+    new_angle = str(degrees) + "\u00B0" + \
         str(minutes) + "'" + str(seconds) + '"'
     return random_select([str(angle), new_angle], [0.55, 0.45])
 
@@ -78,13 +75,7 @@ def udf(col_value):
     return udf
 
 
-# This function is required because we need to apply a
-# function for every column and some columns do not change
-def identity(x):
-    return x
-
-
-def write_to_bigquery(spark, df):
+def write_to_bigquery(df):
     '''Write a dataframe to BigQuery'''
 
     # Create BigQuery Dataset
@@ -95,10 +86,9 @@ def write_to_bigquery(spark, df):
     dataset = client.create_dataset(dataset)
 
     # Saving the data to BigQuery
-    spark.conf.set('temporaryGcsBucket', BUCKET_NAME)
-
     df.write.format('bigquery') \
         .option('table', dataset_id + ".RAW_DATA") \
+        .option("temporaryGcsBucket", BUCKET_NAME) \
         .save()
 
 
@@ -109,7 +99,7 @@ def main():
     upload = True  # Whether to upload data to BigQuery
 
     # Check whether or not results should be uploaded
-    if len(sys.argv) > 1:
+    if len(sys.argv) > 2:
         upload = False
         print("Not uploading results to BigQuery")
     else:
@@ -125,21 +115,21 @@ def main():
     # Declare data transformations for each column in dataframe
     udfs = [
         (dirty_data(trip_duration, True), StringType()),  # tripduration
-        (dirty_data(identity, True), StringType()),  # starttime
-        (dirty_data(identity, True), StringType()),  # stoptime
-        (identity, IntegerType()),  # start_station_id
+        (dirty_data(lambda x: x, True), StringType()),  # starttime
+        (dirty_data(lambda x: x, True), StringType()),  # stoptime
+        (lambda x: x, IntegerType()),  # start_station_id
         (dirty_data(station_name, False), StringType()),  # start_station_name
         (dirty_data(convert_angle, True), StringType()),  # start_station_latitude
         (dirty_data(convert_angle, True), StringType()),  # start_station_longitude
-        (identity, IntegerType()),  # end_station_id
+        (lambda x: x, IntegerType()),  # end_station_id
         (dirty_data(station_name, False), StringType()),  # end_station_name
         (dirty_data(convert_angle, True), StringType()),  # end_station_latitude
         (dirty_data(convert_angle, True), StringType()),  # end_station_longitude
-        (identity, IntegerType()),  # bikeid
+        (lambda x: x, IntegerType()),  # bikeid
         (dirty_data(user_type, False), StringType()),  # usertype
-        (identity, IntegerType()),  # birth_year
+        (lambda x: x, IntegerType()),  # birth_year
         (dirty_data(gender, False), StringType()),  # gender
-        (identity, StringType()),  # customer_plan
+        (lambda x: x, StringType()),  # customer_plan
     ]
 
     # Apply dirty transformations to df
@@ -156,7 +146,7 @@ def main():
     df = new_df.union(dup_df)
 
     if upload:
-        write_to_bigquery(spark, df)
+        write_to_bigquery(df)
 
 
 if __name__ == '__main__':
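
The two substantive changes here are dropping the module-level identity helper in favor of inline lambda x: x (a UserDefinedFunction only needs a callable and a return type, so the named helper added nothing), and narrowing write_to_bigquery to take only the dataframe, since the staging bucket moves from Spark session config to a per-write connector option. A minimal sketch of the refactored write path, assuming the spark-bigquery connector is available on the cluster and using placeholder values for BUCKET_NAME and dataset_id (the real ones are defined elsewhere in setup.py):

    from google.cloud import bigquery

    BUCKET_NAME = "example-staging-bucket"  # placeholder; setup.py defines the real bucket
    dataset_id = "example-project.example_dataset"  # placeholder dataset ID


    def write_to_bigquery(df):
        '''Write a dataframe to BigQuery'''

        # Create BigQuery Dataset
        client = bigquery.Client()
        dataset = bigquery.Dataset(dataset_id)
        dataset = client.create_dataset(dataset)

        # Saving the data to BigQuery; the temporary GCS bucket is now a
        # per-write option, so no SparkSession is needed in this function
        df.write.format('bigquery') \
            .option('table', dataset_id + ".RAW_DATA") \
            .option("temporaryGcsBucket", BUCKET_NAME) \
            .save()

Scoping temporaryGcsBucket to the write call rather than mutating spark.conf keeps the staging bucket local to the one job that needs it, which is what lets the spark parameter disappear from the signature. The loop that consumes the (function, type) pairs is not shown in this diff; a hypothetical sketch of how such a list is typically applied, which also shows why lambda x: x is a drop-in for identity:

    from pyspark.sql.functions import UserDefinedFunction

    def apply_udfs(df, udfs):
        # Wrap each (callable, return_type) pair in a Spark UDF and apply
        # it to the column at the same position; lambda x: x rewrites a
        # column with its own values, i.e. leaves it unchanged.
        for (fn, return_type), name in zip(udfs, df.columns):
            df = df.withColumn(name, UserDefinedFunction(fn, return_type)(df[name]))
        return df
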

data-science-onramp/data-ingestion/setup_test.py

Lines changed: 0 additions & 3 deletions
@@ -1,14 +1,11 @@
 import os
 import re
-
 import uuid
 
 from google.cloud import dataproc_v1 as dataproc
 from google.cloud import storage
-
 import pytest
 
-
 # Set global variables
 PROJECT = os.environ['GCLOUD_PROJECT']
 REGION = "us-central1"
