from pyspark.sql.types import IntegerType, StringType


-# Create a SparkSession under the name "setup". Viewable via the Spark UI
-spark = SparkSession.builder.appName("setup").getOrCreate()
+BUCKET_NAME = sys.argv[1]
+TABLE = "bigquery-public-data.new_york_citibike.citibike_trips"

-bucket_name = sys.argv[1]
-upload = True  # Whether to upload data to BigQuery
-
-# Check whether or not results should be uploaded
-if len(sys.argv) > 1:
-    upload = False
-    print("Not uploading results to BigQuery")
-else:
-    print("Results will be uploaded to BigQuery")
-
-table = "bigquery-public-data.new_york_citibike.citibike_trips"
-
-# Check if table exists
-try:
-    df = spark.read.format('bigquery').option('table', table).load()
-except Py4JJavaError:
-    print(f"{table} does not exist.")
-    sys.exit(0)

# START MAKING DATA DIRTY
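# The helpers below randomly vary column values to simulate messy input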
-
-
def random_select(items, weights):
    '''Picks an item according to the given relative weights'''
    return random.choices(items, weights=weights, k=1)[0]
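# e.g. random_select(['a', 'b'], [0.55, 0.45]) returns 'a' about 55% of the time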
@@ -81,6 +61,8 @@ def convert_angle(angle):
    return random_select([str(angle), new_angle], [0.55, 0.45])


+# This function is nested since a UserDefinedFunction is
+# expected to take a single argument
def dirty_data(proc_func, allow_none):
    '''Master function returns a user defined function
    that transforms the column data'''
@@ -99,42 +81,9 @@ def udf(col_value):
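# Identity helper for columns that pass through unchanged
# (shadows Python's builtin id)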
def id(x):
    return x

+def write_to_bigquery(df, spark):
+    '''Write a dataframe to BigQuery'''

-# Declare data transformations for each column in dataframe
-udfs = [
-    (dirty_data(trip_duration, True), StringType()),  # tripduration
-    (dirty_data(id, True), StringType()),  # starttime
-    (dirty_data(id, True), StringType()),  # stoptime
-    (id, IntegerType()),  # start_station_id
-    (dirty_data(station_name, False), StringType()),  # start_station_name
-    (dirty_data(convert_angle, True), StringType()),  # start_station_latitude
-    (dirty_data(convert_angle, True), StringType()),  # start_station_longitude
-    (id, IntegerType()),  # end_station_id
-    (dirty_data(station_name, False), StringType()),  # end_station_name
-    (dirty_data(convert_angle, True), StringType()),  # end_station_latitude
-    (dirty_data(convert_angle, True), StringType()),  # end_station_longitude
-    (id, IntegerType()),  # bikeid
-    (dirty_data(user_type, False), StringType()),  # usertype
-    (id, IntegerType()),  # birth_year
-    (dirty_data(gender, False), StringType()),  # gender
-    (id, StringType()),  # customer_plan
-]
-
-# Apply dirty transformations to df
-names = df.schema.names
-new_df = df.select(*[UserDefinedFunction(*udf)(column).alias(name)
-                     for udf, column, name in zip(udfs, df.columns, names)])
-
-# Duplicate about 0.01% of the rows
-dup_df = new_df.sample(False, 0.0001, seed=42)
-
-# Create final dirty dataframe
-df = new_df.union(dup_df)
-df.sample(False, 0.0001, seed=50).show(n=200)
-print("Dataframe sample printed")
-
-# Write to BigQuery
-if upload:
    # Create BigQuery Dataset
    client = bigquery.Client()
    dataset_id = f'{client.project}.new_york_citibike_trips'
@@ -143,8 +92,68 @@ def id(x):
    dataset = client.create_dataset(dataset)
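    # Note: create_dataset raises Conflict if the dataset already exists;
    # passing exists_ok=True would make this call idempotent
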
    # Saving the data to BigQuery
-    spark.conf.set('temporaryGcsBucket', bucket_name)
+    spark.conf.set('temporaryGcsBucket', BUCKET_NAME)

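    # The connector stages the dataframe in the temporary GCS bucket
    # before loading it into the BigQuery table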
    df.write.format('bigquery') \
        .option('table', dataset_id + ".RAW_DATA") \
        .save()
+
+def main():
+    # Create a SparkSession under the name "setup". Viewable via the Spark UI
+    spark = SparkSession.builder.appName("setup").getOrCreate()
+
+    upload = True  # Whether to upload data to BigQuery
+
+    # Any extra argument after the bucket name disables the upload
+    if len(sys.argv) > 2:
+        upload = False
+        print("Not uploading results to BigQuery")
+    else:
+        print("Results will be uploaded to BigQuery")
+
+    # Check if table exists
+    try:
+        df = spark.read.format('bigquery').option('table', TABLE).load()
+    except Py4JJavaError:
+        print(f"{TABLE} does not exist.")
+        sys.exit(0)
+
+    # Declare data transformations for each column in dataframe
+    udfs = [
+        (dirty_data(trip_duration, True), StringType()),  # tripduration
+        (dirty_data(id, True), StringType()),  # starttime
+        (dirty_data(id, True), StringType()),  # stoptime
+        (id, IntegerType()),  # start_station_id
+        (dirty_data(station_name, False), StringType()),  # start_station_name
+        (dirty_data(convert_angle, True), StringType()),  # start_station_latitude
+        (dirty_data(convert_angle, True), StringType()),  # start_station_longitude
+        (id, IntegerType()),  # end_station_id
+        (dirty_data(station_name, False), StringType()),  # end_station_name
+        (dirty_data(convert_angle, True), StringType()),  # end_station_latitude
+        (dirty_data(convert_angle, True), StringType()),  # end_station_longitude
+        (id, IntegerType()),  # bikeid
+        (dirty_data(user_type, False), StringType()),  # usertype
+        (id, IntegerType()),  # birth_year
+        (dirty_data(gender, False), StringType()),  # gender
+        (id, StringType()),  # customer_plan
+    ]
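+
+    # Note: UserDefinedFunction(*udf) unpacks each (function, return type)
+    # pair into a Spark UDF; zip pairs it with the matching source column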
+
+    # Apply dirty transformations to df
+    names = df.schema.names
+    new_df = df.select(*[UserDefinedFunction(*udf)(column).alias(name)
+                         for udf, column, name in zip(udfs, df.columns, names)])
+
+    # Duplicate about 0.01% of the rows
+    dup_df = new_df.sample(False, 0.0001, seed=42)
+
+    # Create final dirty dataframe
+    df = new_df.union(dup_df)
+    df.sample(False, 0.0001, seed=50).show(n=200)
+    print("Dataframe sample printed")
+
+    if upload:
+        write_to_bigquery(df, spark)
+
+
+if __name__ == '__main__':
+    main()
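+
+# Usage sketch (assuming a cluster with the spark-bigquery connector):
+#   spark-submit setup.py <bucket-name> [any-second-arg-skips-the-upload]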