@@ -1,13 +1,10 @@
 import random
 import sys
-
 from time import time_ns

 from google.cloud import bigquery
-
 from py4j.protocol import Py4JJavaError
 from pyspark.sql import SparkSession
-
 from pyspark.sql.functions import UserDefinedFunction
 from pyspark.sql.types import IntegerType, StringType

@@ -56,7 +53,7 @@ def convert_angle(angle):
     degrees = int(angle)
     minutes = int((angle - degrees) * 60)
     seconds = int((angle - degrees - minutes / 60) * 3600)
-    new_angle = str(degrees) + u"\u00B0" + \
+    new_angle = str(degrees) + "\u00B0" + \
         str(minutes) + "'" + str(seconds) + '"'
     return random_select([str(angle), new_angle], [0.55, 0.45])

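As a side note on the hunk above: the u prefix is redundant in Python 3 (string literals are already unicode), and the degree/minute/second arithmetic can be checked in isolation. A minimal sketch, with 40.7128 as a made-up sample latitude:

    angle = 40.7128                                          # made-up sample value
    degrees = int(angle)                                     # 40
    minutes = int((angle - degrees) * 60)                    # 42
    seconds = int((angle - degrees - minutes / 60) * 3600)   # 46
    print(str(degrees) + "\u00B0" + str(minutes) + "'" + str(seconds) + '"')  # 40°42'46"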
@@ -78,13 +75,7 @@ def udf(col_value):
     return udf


-# This function is required because we need to apply a
-# function for every column and some columns do not change
-def identity(x):
-    return x
-
-
-def write_to_bigquery(spark, df):
+def write_to_bigquery(df):
     '''Write a dataframe to BigQuery'''

     # Create BigQuery Dataset
@@ -95,10 +86,9 @@ def write_to_bigquery(spark, df):
     dataset = client.create_dataset(dataset)

     # Saving the data to BigQuery
-    spark.conf.set('temporaryGcsBucket', BUCKET_NAME)
-
     df.write.format('bigquery') \
         .option('table', dataset_id + ".RAW_DATA") \
+        .option("temporaryGcsBucket", BUCKET_NAME) \
         .save()


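For context on the hunk above: the spark-bigquery connector accepts the staging bucket either as a session-level setting or as a per-write option, and switching to the per-write form is what lets write_to_bigquery drop its SparkSession argument. A minimal sketch of the two forms, with a made-up bucket and table name:

    # Session-level setting (old form): applies to every BigQuery write in the session.
    spark.conf.set('temporaryGcsBucket', 'my-staging-bucket')   # made-up bucket name

    # Per-write option (new form): scoped to this one save.
    df.write.format('bigquery') \
        .option('table', 'my_dataset.RAW_DATA') \
        .option('temporaryGcsBucket', 'my-staging-bucket') \
        .save()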
@@ -109,7 +99,7 @@ def main():
     upload = True  # Whether to upload data to BigQuery

     # Check whether or not results should be uploaded
-    if len(sys.argv) > 1:
+    if len(sys.argv) > 2:
         upload = False
         print("Not uploading results to BigQuery")
     else:
@@ -125,21 +115,21 @@ def main():
     # Declare data transformations for each column in dataframe
     udfs = [
         (dirty_data(trip_duration, True), StringType()),  # tripduration
-        (dirty_data(identity, True), StringType()),  # starttime
-        (dirty_data(identity, True), StringType()),  # stoptime
-        (identity, IntegerType()),  # start_station_id
+        (dirty_data(lambda x: x, True), StringType()),  # starttime
+        (dirty_data(lambda x: x, True), StringType()),  # stoptime
+        (lambda x: x, IntegerType()),  # start_station_id
         (dirty_data(station_name, False), StringType()),  # start_station_name
         (dirty_data(convert_angle, True), StringType()),  # start_station_latitude
         (dirty_data(convert_angle, True), StringType()),  # start_station_longitude
-        (identity, IntegerType()),  # end_station_id
+        (lambda x: x, IntegerType()),  # end_station_id
         (dirty_data(station_name, False), StringType()),  # end_station_name
         (dirty_data(convert_angle, True), StringType()),  # end_station_latitude
         (dirty_data(convert_angle, True), StringType()),  # end_station_longitude
-        (identity, IntegerType()),  # bikeid
+        (lambda x: x, IntegerType()),  # bikeid
         (dirty_data(user_type, False), StringType()),  # usertype
-        (identity, IntegerType()),  # birth_year
+        (lambda x: x, IntegerType()),  # birth_year
         (dirty_data(gender, False), StringType()),  # gender
-        (identity, StringType()),  # customer_plan
+        (lambda x: x, StringType()),  # customer_plan
     ]

     # Apply dirty transformations to df
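The (function, return type) pairs above are presumably wrapped with pyspark's UserDefinedFunction (imported at the top of the file) when the transformations are applied column by column; a rough sketch of that general pattern, with the dataframe and column name as placeholders:

    from pyspark.sql.functions import UserDefinedFunction
    from pyspark.sql.types import IntegerType

    # Wrap a plain Python function (here the pass-through lambda) as a Spark UDF
    # with an explicit return type, then apply it to a single column.
    pass_through = UserDefinedFunction(lambda x: x, IntegerType())
    df = df.withColumn('start_station_id', pass_through(df['start_station_id']))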
@@ -156,7 +146,7 @@ def main():
     df = new_df.union(dup_df)

     if upload:
-        write_to_bigquery(spark, df)
+        write_to_bigquery(df)


 if __name__ == '__main__':