Commit 739114a

begin addressing comments
1 parent 92cf763 commit 739114a

3 files changed: +90, -115 lines

data-science-onramp/data-ingestion/setup.py

Lines changed: 29 additions & 28 deletions
@@ -1,4 +1,4 @@
-from random import choice, choices, randint, seed
+import random
 import sys
 
 from time import time_ns
@@ -19,10 +19,10 @@
 upload = True  # Whether to upload data to BigQuery
 
 # Check whether or not results should be uploaded
-try:
-    sys.argv[2]
+if len(sys.arv) > 1:
     upload = False
-except IndexError:
+    print("Not uploading results to BigQuery")
+else:
     print("Results will be uploaded to BigQuery")
 
 table = "bigquery-public-data.new_york_citibike.citibike_trips"
@@ -37,59 +37,60 @@
 # START MAKING DATA DIRTY
 
 
-def random_select(items, cum_weights):
+def random_select(items, weights):
     '''Picks an item according to the cumulative weights'''
-    return choices(items, cum_weights=cum_weights, k=1)[0]
+    return random.choices(items, weights=weights, k=1)[0]
 
 
-def tripduration(duration):
+def trip_duration(duration):
     '''Converts trip duration to other units'''
     seconds = str(duration) + " s"
     minutes = str(float(duration) / 60) + " min"
     hours = str(float(duration) / 3600) + " h"
-    return random_select([seconds, minutes, hours, str(randint(-1000, -1))],
-                         [0.3, 0.6, 0.9, 1])
+    return random_select([seconds, minutes, hours,
+                          str(random.randint(-1000, -1))],
+                         [0.3, 0.3, 0.3, 0.1])
 
 
 def station_name(name):
     '''Replaces '&' with '/' with a 50% chance'''
-    return choice([name, name.replace("&", "/")])
+    return random.choice([name, name.replace("&", "/")])
 
 
-def usertype(user):
+def user_type(user):
     '''Manipulates the user type string'''
-    return choice([user, user.upper(), user.lower(),
-                   "sub" if user == "Subscriber" else user,
-                   "cust" if user == "Customer" else user])
+    return random.choice([user, user.upper(), user.lower(),
+                          "sub" if user == "Subscriber" else user,
+                          "cust" if user == "Customer" else user])
 
 
 def gender(s):
     '''Manipulates the gender string'''
-    return choice([s, s.upper(), s.lower(),
-                   s[0] if len(s) > 0 else "",
-                   s[0].lower() if len(s) > 0 else ""])
+    return random.choice([s, s.upper(), s.lower(),
+                          s[0] if len(s) > 0 else "",
+                          s[0].lower() if len(s) > 0 else ""])
 
 
-def convertAngle(angle):
+def convert_angle(angle):
     '''Converts long and lat to DMS notation'''
     degrees = int(angle)
     minutes = int((angle - degrees) * 60)
     seconds = int((angle - degrees - minutes/60) * 3600)
     new_angle = str(degrees) + u"\u00B0" + \
         str(minutes) + "'" + str(seconds) + '"'
-    return random_select([str(angle), new_angle], cum_weights=[0.55, 1])
+    return random_select([str(angle), new_angle], [0.55, 0.45])
 
 
 def dirty_data(proc_func, allow_none):
     '''Master function returns a user defined function
     that transforms the column data'''
     def udf(col_value):
-        seed(hash(col_value) + time_ns())
+        random.seed(hash(col_value) + time_ns())
         if col_value is None:
             return col_value
         elif allow_none:
             return random_select([None, proc_func(col_value)],
-                                 cum_weights=[0.05, 1])
+                                 [0.05, 0.95])
         else:
            return proc_func(col_value)
    return udf
@@ -101,19 +102,19 @@ def id(x):
 
 # Declare data transformations for each column in dataframe
 udfs = [
-    (dirty_data(tripduration, True), StringType()),  # tripduration
+    (dirty_data(trip_duration, True), StringType()),  # tripduration
     (dirty_data(id, True), StringType()),  # starttime
     (dirty_data(id, True), StringType()),  # stoptime
     (id, IntegerType()),  # start_station_id
     (dirty_data(station_name, False), StringType()),  # start_station_name
-    (dirty_data(convertAngle, True), StringType()),  # start_station_latitude
-    (dirty_data(convertAngle, True), StringType()),  # start_station_longitude
+    (dirty_data(convert_angle, True), StringType()),  # start_station_latitude
+    (dirty_data(convert_angle, True), StringType()),  # start_station_longitude
     (id, IntegerType()),  # end_station_id
     (dirty_data(station_name, False), StringType()),  # end_station_name
-    (dirty_data(convertAngle, True), StringType()),  # end_station_latitude
-    (dirty_data(convertAngle, True), StringType()),  # end_station_longitude
+    (dirty_data(convert_angle, True), StringType()),  # end_station_latitude
+    (dirty_data(convert_angle, True), StringType()),  # end_station_longitude
     (id, IntegerType()),  # bikeid
-    (dirty_data(usertype, False), StringType()),  # usertype
+    (dirty_data(user_type, False), StringType()),  # usertype
     (id, IntegerType()),  # birth_year
     (dirty_data(gender, False), StringType()),  # gender
     (id, StringType()),  # customer_plan
@@ -136,7 +137,7 @@ def id(x):
 if upload:
     # Create BigQuery Dataset
     client = bigquery.Client()
-    dataset_id = '{}.new_york_citibike_trips'.format(client.project)
+    dataset_id = f'{client.project}.new_york_citibike_trips'
     dataset = bigquery.Dataset(dataset_id)
     dataset.location = "US"
     dataset = client.create_dataset(dataset)
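
The setup.py change above swaps cumulative weights for per-item weights in random.choices. The two forms describe the same distribution only when the weights list is the element-wise difference of the cumulative list, which is why [0.3, 0.6, 0.9, 1] becomes [0.3, 0.3, 0.3, 0.1]. A minimal standalone sketch of the two equivalent calls (the item names here are illustrative, not taken from setup.py):

import random

items = ["seconds", "minutes", "hours", "negative"]

# Old style: cum_weights are running totals of the probabilities.
old_pick = random.choices(items, cum_weights=[0.3, 0.6, 0.9, 1.0], k=1)[0]

# New style: weights are each item's own probability mass.
new_pick = random.choices(items, weights=[0.3, 0.3, 0.3, 0.1], k=1)[0]

print(old_pick, new_pick)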

data-science-onramp/data-ingestion/setup.sh

Lines changed: 3 additions & 0 deletions
@@ -1,4 +1,7 @@
 # Submit a PySpark job via the Cloud Dataproc Jobs API
+# Requires having CLUSTER_NAME and BUCKET_NAME set as
+# environment variables
+
 gcloud dataproc jobs submit pyspark \
     --cluster ${CLUSTER_NAME} \
     --jars gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar \
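
The new comment in setup.sh documents that the script reads CLUSTER_NAME and BUCKET_NAME from the environment. A sketch of how it would be invoked under that assumption (the values below are placeholders, not from the repo):

# Placeholders; substitute a real Dataproc cluster and GCS bucket.
export CLUSTER_NAME=my-dataproc-cluster
export BUCKET_NAME=my-staging-bucket

bash data-science-onramp/data-ingestion/setup.sh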

data-science-onramp/data-ingestion/setup-test.py renamed to data-science-onramp/data-ingestion/setup_test.py

Lines changed: 58 additions & 87 deletions
@@ -11,62 +11,25 @@
 
 import pytest
 
-waiting_cluster_callback = False
 
 # Set global variables
-project = os.environ['GCLOUD_PROJECT']
-region = "us-central1"
-zone = "us-central1-a"
-cluster_name = 'setup-test-{}'.format(str(uuid.uuid4()))
-bucket_name = 'setup-test-code-{}'.format(str(uuid.uuid4()))
+PROJECT = os.environ['GCLOUD_PROJECT']
+REGION = "us-central1"
+ZONE = "us-central1-a"
+CLUSTER_NAME = f'setup-test-{uuid.uuid4()}'
+BUCKET_NAME = f'setup-test-code-{uuid.uuid4()}'
 
+BUCKET = None
 
-@pytest.fixture(autouse=True)
-def teardown():
-    yield
-
-    # Delete cluster
-    cluster_client = dataproc.ClusterControllerClient(client_options={
-        'api_endpoint': f'{region}-dataproc.googleapis.com:443'
-    })
-
-    try:
-        operation = cluster_client.delete_cluster(project, region,
-                                                  cluster_name)
-        operation.result()
-    except GoogleAPICallError:
-        pass
-
-    # Delete GCS bucket
-    storage_client = storage.Client()
-    try:
-        bucket = storage_client.get_bucket(bucket_name)
-        bucket.delete(force=True)
-    except NotFound:
-        pass
-
-
-def test_setup(capsys):
-    '''Tests setup.py by submitting it to a dataproc cluster'''
-
-    # Create GCS Bucket
-    storage_client = storage.Client()
-    bucket = storage_client.create_bucket(bucket_name)
-
-    # Upload file
-    destination_blob_name = "setup.py"
-    blob = bucket.blob(destination_blob_name)
-    blob.upload_from_filename("setup.py")
-
-    job_file_name = "gs://" + bucket_name + "/setup.py"
 
+@pytest.fixture(autouse=True)
+def setup_and_teardown_cluster():
     # Create cluster configuration
     zone_uri = \
-        'https://www.googleapis.com/compute/v1/projects/{}/zones/{}'.format(
-            project, zone)
+        f'https://www.googleapis.com/compute/v1/projects/{PROJECT}/zones/{ZONE}'
     cluster_data = {
-        'project_id': project,
-        'cluster_name': cluster_name,
+        'project_id': PROJECT,
+        'cluster_name': CLUSTER_NAME,
         'config': {
             'gce_cluster_config': {
                 'zone_uri': zone_uri,
@@ -99,27 +62,59 @@ def test_setup(capsys):
 
     # Create cluster using cluster client
     cluster_client = dataproc.ClusterControllerClient(client_options={
-        'api_endpoint': '{}-dataproc.googleapis.com:443'.format(region)
+        'api_endpoint': '{}-dataproc.googleapis.com:443'.format(REGION)
     })
 
-    cluster = cluster_client.create_cluster(project, region, cluster_data)
-    cluster.add_done_callback(callback)
+    operation = cluster_client.create_cluster(PROJECT, REGION, cluster_data)
 
     # Wait for cluster to provision
-    global waiting_cluster_callback
-    waiting_cluster_callback = True
+    operation.result()
 
-    wait_for_cluster_creation()
+    yield
+
+    # Delete cluster
+    cluster_client = dataproc.ClusterControllerClient(client_options={
+        'api_endpoint': f'{REGION}-dataproc.googleapis.com:443'
+    })
+
+    operation = cluster_client.delete_cluster(PROJECT, REGION,
+                                              CLUSTER_NAME)
+    operation.result()
+
+
+@pytest.fixture(autouse=True)
+def setup_and_teardown_bucket():
+    global BUCKET
+    # Create GCS Bucket
+    storage_client = storage.Client()
+    BUCKET = storage_client.create_bucket(BUCKET_NAME)
+
+    yield
+
+    # Delete GCS bucket
+    storage_client = storage.Client()
+    bucket = storage_client.get_bucket(BUCKET_NAME)
+    bucket.delete(force=True)
+
+def test_setup(capsys):
+    '''Tests setup.py by submitting it to a dataproc cluster'''
+
+    # Upload file
+    destination_blob_name = "setup.py"
+    blob = BUCKET.blob(destination_blob_name)
+    blob.upload_from_filename("setup.py")
+
+    job_file_name = "gs://" + BUCKET_NAME + "/setup.py"
 
     # Create job configuration
     job_details = {
         'placement': {
-            'cluster_name': cluster_name
+            'cluster_name': CLUSTER_NAME
         },
         'pyspark_job': {
             'main_python_file_uri': job_file_name,
             'args': [
-                bucket_name,
+                BUCKET_NAME,
                 "--test",
             ],
             "jar_file_uris": [
@@ -130,25 +125,21 @@ def test_setup(capsys):
 
     # Submit job to dataproc cluster
     job_client = dataproc.JobControllerClient(client_options={
-        'api_endpoint': '{}-dataproc.googleapis.com:443'.format(region)
+        'api_endpoint': '{}-dataproc.googleapis.com:443'.format(REGION)
     })
 
-    result = job_client.submit_job(project_id=project, region=region,
+    response = job_client.submit_job(project_id=PROJECT, region=REGION,
                                    job=job_details)
 
-    job_id = result.reference.job_id
+    job_id = response.reference.job_id
     print('Submitted job \"{}\".'.format(job_id))
 
     # Wait for job to complete
-    wait_for_job(job_client, job_id)
+    result = response.add_done_callback(callback)
 
     # Get job output
-    cluster_info = cluster_client.get_cluster(project, region, cluster_name)
-    bucket = storage_client.get_bucket(cluster_info.config.config_bucket)
-    output_blob = (
-        'google-cloud-dataproc-metainfo/{}/jobs/{}/driveroutput.000000000'
-        .format(cluster_info.cluster_uuid, job_id))
-    out = bucket.blob(output_blob).download_as_string().decode("utf-8")
+    output_location = result.driver_output_resource_uri() + ".000000000"
+    output = BUCKET.blob(output_location).download_as_string().decode("utf-8")
 
     # tripDuration
     assert re.search("[0-9] s", out)
@@ -186,25 +177,5 @@ def test_setup(capsys):
     # Missing data
     assert "null" in out
 
-
 def callback(operation_future):
-    '''Sets a flag to stop waiting'''
-    global waiting_cluster_callback
-    waiting_cluster_callback = False
-
-
-def wait_for_cluster_creation():
-    '''Waits for cluster to create'''
-    while True:
-        if not waiting_cluster_callback:
-            break
-
-
-def wait_for_job(job_client, job_id):
-    '''Waits for job to finish'''
-    while True:
-        job = job_client.get_job(project, region, job_id)
-        assert job.status.State.Name(job.status.state) != "ERROR"
-
-        if job.status.State.Name(job.status.state) == "DONE":
-            return
+    return operation_future.result()
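
The test refactor above drops the hand-rolled callback and polling helpers in favor of pytest's yield-style fixtures: code before the yield runs as setup, code after it runs as teardown, and autouse=True applies the fixture to every test in the module without it being requested explicitly. A stripped-down sketch of that pattern with a stand-in resource (the FakeResource class is hypothetical; only the fixture mechanics mirror setup_test.py):

import pytest


class FakeResource:
    """Stand-in for the cluster/bucket managed by the real fixtures."""
    def __init__(self):
        self.alive = True

    def delete(self):
        self.alive = False


RESOURCE = None


@pytest.fixture(autouse=True)
def setup_and_teardown_resource():
    global RESOURCE
    # Setup: runs before each test in the module.
    RESOURCE = FakeResource()

    yield  # The test body executes here.

    # Teardown: runs after the test, even if it failed.
    RESOURCE.delete()


def test_resource_is_available():
    assert RESOURCE is not None and RESOURCE.alive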
