Commit 4afbf1c

address code structure and global variable issues
1 parent 681eaf3 commit 4afbf1c
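
The two issues the message names show up in the setup.py diff below: work that ran at import time moves into a main() entry point, and values that are fixed for the whole run become UPPER_CASE module constants. A minimal sketch of that pattern (illustrative only, not code from this commit):

import sys

# Values fixed for the whole run become UPPER_CASE module constants
BUCKET_NAME = sys.argv[1]
TABLE = "bigquery-public-data.new_york_citibike.citibike_trips"

def main():
    # Work that used to run at import time now runs only when invoked
    print(f"Reading {TABLE}, staging through {BUCKET_NAME}")

if __name__ == '__main__':
    main()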

File tree

3 files changed: +78 −289 lines changed

data-science-onramp/data-ingestion/noxfile.py

Lines changed: 0 additions & 225 deletions
This file was deleted.

data-science-onramp/data-ingestion/setup.py

Lines changed: 67 additions & 58 deletions
@@ -12,31 +12,11 @@
 from pyspark.sql.types import IntegerType, StringType
 
 
-# Create a SparkSession under the name "setup". Viewable via the Spark UI
-spark = SparkSession.builder.appName("setup").getOrCreate()
+BUCKET_NAME = sys.argv[1]
+TABLE = "bigquery-public-data.new_york_citibike.citibike_trips"
 
-bucket_name = sys.argv[1]
-upload = True  # Whether to upload data to BigQuery
-
-# Check whether or not results should be uploaded
-if len(sys.arv) > 1:
-    upload = False
-    print("Not uploading results to BigQuery")
-else:
-    print("Results will be uploaded to BigQuery")
-
-table = "bigquery-public-data.new_york_citibike.citibike_trips"
-
-# Check if table exists
-try:
-    df = spark.read.format('bigquery').option('table', table).load()
-except Py4JJavaError:
-    print(f"{table} does not exist. ")
-    sys.exit(0)
 
 # START MAKING DATA DIRTY
-
-
 def random_select(items, weights):
     '''Picks an item according to the cumulative weights'''
     return random.choices(items, weights=weights, k=1)[0]
@@ -81,6 +61,8 @@ def convert_angle(angle):
     return random_select([str(angle), new_angle], [0.55, 0.45])
 
 
+# This function is nested since a UserDefinedFunction is
+# expected to take a single argument
 def dirty_data(proc_func, allow_none):
     '''Master function returns a user defined function
     that transforms the column data'''
@@ -99,42 +81,9 @@ def udf(col_value):
 def id(x):
     return x
 
+def write_to_bigquery(df):
+    '''Write a dataframe to BigQuery'''
 
-# Declare data transformations for each column in dataframe
-udfs = [
-    (dirty_data(trip_duration, True), StringType()),  # tripduration
-    (dirty_data(id, True), StringType()),  # starttime
-    (dirty_data(id, True), StringType()),  # stoptime
-    (id, IntegerType()),  # start_station_id
-    (dirty_data(station_name, False), StringType()),  # start_station_name
-    (dirty_data(convert_angle, True), StringType()),  # start_station_latitude
-    (dirty_data(convert_angle, True), StringType()),  # start_station_longitude
-    (id, IntegerType()),  # end_station_id
-    (dirty_data(station_name, False), StringType()),  # end_station_name
-    (dirty_data(convert_angle, True), StringType()),  # end_station_latitude
-    (dirty_data(convert_angle, True), StringType()),  # end_station_longitude
-    (id, IntegerType()),  # bikeid
-    (dirty_data(user_type, False), StringType()),  # usertype
-    (id, IntegerType()),  # birth_year
-    (dirty_data(gender, False), StringType()),  # gender
-    (id, StringType()),  # customer_plan
-]
-
-# Apply dirty transformations to df
-names = df.schema.names
-new_df = df.select(*[UserDefinedFunction(*udf)(column).alias(name)
-                     for udf, column, name in zip(udfs, df.columns, names)])
-
-# Duplicate about 0.01% of the rows
-dup_df = new_df.sample(False, 0.0001, seed=42)
-
-# Create final dirty dataframe
-df = new_df.union(dup_df)
-df.sample(False, 0.0001, seed=50).show(n=200)
-print("Dataframe sample printed")
-
-# Write to BigQuery
-if upload:
     # Create BigQuery Dataset
     client = bigquery.Client()
     dataset_id = f'{client.project}.new_york_citibike_trips'
@@ -143,8 +92,68 @@ def id(x):
     dataset = client.create_dataset(dataset)
 
     # Saving the data to BigQuery
-    spark.conf.set('temporaryGcsBucket', bucket_name)
+    spark.conf.set('temporaryGcsBucket', BUCKET_NAME)
 
     df.write.format('bigquery') \
         .option('table', dataset_id + ".RAW_DATA") \
         .save()
+
+def main():
+    # Create a SparkSession under the name "setup". Viewable via the Spark UI
+    spark = SparkSession.builder.appName("setup").getOrCreate()
+
+    upload = True  # Whether to upload data to BigQuery
+
+    # Check whether or not results should be uploaded
+    if len(sys.argv) > 1:
+        upload = False
+        print("Not uploading results to BigQuery")
+    else:
+        print("Results will be uploaded to BigQuery")
+
+    # Check if table exists
+    try:
+        df = spark.read.format('bigquery').option('table', TABLE).load()
+    except Py4JJavaError:
+        print(f"{TABLE} does not exist. ")
+        sys.exit(0)
+
+    # Declare data transformations for each column in dataframe
+    udfs = [
+        (dirty_data(trip_duration, True), StringType()),  # tripduration
+        (dirty_data(id, True), StringType()),  # starttime
+        (dirty_data(id, True), StringType()),  # stoptime
+        (id, IntegerType()),  # start_station_id
+        (dirty_data(station_name, False), StringType()),  # start_station_name
+        (dirty_data(convert_angle, True), StringType()),  # start_station_latitude
+        (dirty_data(convert_angle, True), StringType()),  # start_station_longitude
+        (id, IntegerType()),  # end_station_id
+        (dirty_data(station_name, False), StringType()),  # end_station_name
+        (dirty_data(convert_angle, True), StringType()),  # end_station_latitude
+        (dirty_data(convert_angle, True), StringType()),  # end_station_longitude
+        (id, IntegerType()),  # bikeid
+        (dirty_data(user_type, False), StringType()),  # usertype
+        (id, IntegerType()),  # birth_year
+        (dirty_data(gender, False), StringType()),  # gender
+        (id, StringType()),  # customer_plan
+    ]
+
+    # Apply dirty transformations to df
+    names = df.schema.names
+    new_df = df.select(*[UserDefinedFunction(*udf)(column).alias(name)
+                         for udf, column, name in zip(udfs, df.columns, names)])
+
+    # Duplicate about 0.01% of the rows
+    dup_df = new_df.sample(False, 0.0001, seed=42)
+
+    # Create final dirty dataframe
+    df = new_df.union(dup_df)
+    df.sample(False, 0.0001, seed=50).show(n=200)
+    print("Dataframe sample printed")
+
+    if upload:
+        write_to_bigquery(df)
+
+
+if __name__ == '__main__':
+    main()
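
The heart of the moved block is the per-column UDF table: one (function, return type) pair per column, zipped against df.columns so each column gets its own transformation while keeping its name. A self-contained sketch of the same pattern with toy data (the names here are invented; the diff builds its pairs from dirty_data, a wrapper that returns the single-argument callable a UDF expects, and constructs UserDefinedFunction directly where this sketch uses the equivalent functions.udf wrapper):

import random

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StringType

spark = SparkSession.builder.appName("udf-pattern-demo").getOrCreate()
df = spark.createDataFrame([("1", "Subscriber"), ("2", "Customer")],
                           ["id", "usertype"])

def dirty(proc_func):
    # Like the diff's dirty_data: returns the one-argument callable a UDF expects
    def transform(col_value):
        # Leave roughly half the values untouched, corrupt the rest
        return col_value if random.random() < 0.5 else proc_func(col_value)
    return transform

# One (function, return type) pair per column, in schema order
pairs = [
    (dirty(lambda x: x + "0"), StringType()),    # id: occasionally mangle digits
    (dirty(lambda s: s.upper()), StringType()),  # usertype: occasionally uppercase
]

# Wrap each pair as a UDF, apply it to its column, and keep the column name
new_df = df.select(*[F.udf(fn, typ)(col).alias(col)
                     for (fn, typ), col in zip(pairs, df.columns)])
new_df.show()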

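The other half of the refactor is write_to_bigquery: create the destination dataset with google-cloud-bigquery, then save the dataframe through the Spark BigQuery connector, which stages rows in a Cloud Storage bucket. A hedged standalone sketch of that path; unlike the diff, the SparkSession and bucket are passed in explicitly rather than read from module scope, and exists_ok (a google-cloud-bigquery option the diff does not use) makes reruns idempotent:

from google.cloud import bigquery

def write_to_bigquery(spark, df, bucket_name):
    '''Create the target dataset and save df through the BigQuery connector'''
    client = bigquery.Client()
    dataset_id = f'{client.project}.new_york_citibike_trips'
    dataset = bigquery.Dataset(dataset_id)
    # exists_ok avoids a Conflict error if the dataset already exists
    client.create_dataset(dataset, exists_ok=True)

    # The connector stages the rows in GCS before loading them into BigQuery
    spark.conf.set('temporaryGcsBucket', bucket_name)
    df.write.format('bigquery') \
        .option('table', dataset_id + '.RAW_DATA') \
        .save()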