1
- from random import choice , choices , randint , seed
1
+ import random
2
2
import sys
3
3
4
4
from time import time_ns
upload = True  # Whether to upload data to BigQuery

# Check whether or not results should be uploaded.
# Any extra command-line argument (beyond the script name) disables the upload.
if len(sys.argv) > 1:
    upload = False
    print("Not uploading results to BigQuery")
else:
    print("Results will be uploaded to BigQuery")
27
27
28
28
table = "bigquery-public-data.new_york_citibike.citibike_trips"
37
37
# START MAKING DATA DIRTY
38
38
39
39
def random_select(items, weights):
    '''Picks one item at random according to the relative weights

    `weights` holds one relative (non-cumulative) weight per entry of
    `items` and is passed straight to random.choices. Returns the single
    selected item.
    '''
    return random.choices(items, weights=weights, k=1)[0]
43
43
44
44
def trip_duration(duration):
    '''Converts trip duration to other units

    Returns one of four strings chosen at random: the duration followed
    by " s", the duration divided by 60 followed by " min", the duration
    divided by 3600 followed by " h" (weight 0.3 each), or a random
    negative integer between -1000 and -1 (weight 0.1).
    '''
    seconds = f"{duration} s"
    minutes = f"{float(duration) / 60} min"
    hours = f"{float(duration) / 3600} h"
    # Deliberately invalid value: a negative number with no unit
    invalid = str(random.randint(-1000, -1))
    return random_select([seconds, minutes, hours, invalid],
                         [0.3, 0.3, 0.3, 0.1])
52
53
53
54
def station_name(name):
    '''Replaces '&' with '/' with a 50% chance

    Returns either `name` unchanged or `name` with every '&' replaced
    by '/', each picked with equal probability.
    '''
    return random.choice([name, name.replace("&", "/")])
57
58
58
59
def user_type(user):
    '''Manipulates the user type string

    Picks uniformly among five candidates: the original string, its
    upper-case form, its lower-case form, "sub" when the input is exactly
    "Subscriber", and "cust" when the input is exactly "Customer". The
    last two candidates fall back to the original string when the input
    does not match.
    '''
    return random.choice([user, user.upper(), user.lower(),
                          "sub" if user == "Subscriber" else user,
                          "cust" if user == "Customer" else user])
64
65
65
66
def gender(s):
    '''Manipulates the gender string

    Picks uniformly among five candidates: the original string, its
    upper-case form, its lower-case form, its first character, and its
    first character lower-cased. For an empty string every candidate is
    the empty string.
    '''
    # Guard the index so an empty string does not raise IndexError
    initial = s[0] if s else ""
    return random.choice([s, s.upper(), s.lower(),
                          initial, initial.lower()])
71
72
72
73
def convert_angle(angle):
    '''Converts long and lat to DMS notation

    Returns str(angle) with weight 0.55, or the angle rewritten as
    degrees, minutes and seconds (e.g. 40° 45'30") with weight 0.45.
    Each component is truncated with int(), so for a negative angle the
    minutes and seconds components are negative (or zero) as well.
    '''
    degrees = int(angle)
    minutes = int((angle - degrees) * 60)
    seconds = int((angle - degrees - minutes / 60) * 3600)
    new_angle = f"{degrees}\u00B0 {minutes}'{seconds}\""
    return random_select([str(angle), new_angle], [0.55, 0.45])
81
82
82
83
def dirty_data(proc_func, allow_none):
    '''Master function returns a user defined function
    that transforms the column data

    proc_func: callable applied to each non-None column value.
    allow_none: when true, the returned function yields None with
        weight 0.05 and the processed value with weight 0.95.

    The returned function passes None inputs through untouched.
    '''
    def udf(col_value):
        # Reseed on every call, mixing the value's hash with the current
        # time in nanoseconds
        random.seed(hash(col_value) + time_ns())
        if col_value is None:
            return col_value
        elif allow_none:
            return random_select([None, proc_func(col_value)],
                                 [0.05, 0.95])
        else:
            return proc_func(col_value)
    return udf
@@ -101,19 +102,19 @@ def id(x):
101
102
102
103
# Declare data transformations for each column in dataframe
103
104
udfs = [
104
- (dirty_data (tripduration , True ), StringType ()), # tripduration
105
+ (dirty_data (trip_duration , True ), StringType ()), # tripduration
105
106
(dirty_data (id , True ), StringType ()), # starttime
106
107
(dirty_data (id , True ), StringType ()), # stoptime
107
108
(id , IntegerType ()), # start_station_id
108
109
(dirty_data (station_name , False ), StringType ()), # start_station_name
109
- (dirty_data (convertAngle , True ), StringType ()), # start_station_latitude
110
- (dirty_data (convertAngle , True ), StringType ()), # start_station_longitude
110
+ (dirty_data (convert_angle , True ), StringType ()), # start_station_latitude
111
+ (dirty_data (convert_angle , True ), StringType ()), # start_station_longitude
111
112
(id , IntegerType ()), # end_station_id
112
113
(dirty_data (station_name , False ), StringType ()), # end_station_name
113
- (dirty_data (convertAngle , True ), StringType ()), # end_station_latitude
114
- (dirty_data (convertAngle , True ), StringType ()), # end_station_longitude
114
+ (dirty_data (convert_angle , True ), StringType ()), # end_station_latitude
115
+ (dirty_data (convert_angle , True ), StringType ()), # end_station_longitude
115
116
(id , IntegerType ()), # bikeid
116
- (dirty_data (usertype , False ), StringType ()), # usertype
117
+ (dirty_data (user_type , False ), StringType ()), # usertype
117
118
(id , IntegerType ()), # birth_year
118
119
(dirty_data (gender , False ), StringType ()), # gender
119
120
(id , StringType ()), # customer_plan
@@ -136,7 +137,7 @@ def id(x):
136
137
if upload :
137
138
# Create BigQuery Dataset
138
139
client = bigquery .Client ()
139
- dataset_id = '{ }.new_york_citibike_trips'. format ( client . project )
140
+ dataset_id = f' { client . project } .new_york_citibike_trips'
140
141
dataset = bigquery .Dataset (dataset_id )
141
142
dataset .location = "US"
142
143
dataset = client .create_dataset (dataset )
0 commit comments