Commit 8cd7dc6

fix PR comments
1 parent 744f80c commit 8cd7dc6

File tree

3 files changed: +73 -110 lines changed


.gitignore

Lines changed: 0 additions & 30 deletions
This file was deleted.

data-science-onramp/data-ingestion/setup.py

Lines changed: 15 additions & 13 deletions
@@ -46,8 +46,8 @@ def user_type(user):
 
 def gender(s):
     '''Manipulates the gender string'''
-    return random.choice([s, s.upper(), s.lower(),
-                          s[0] if len(s) > 0 else "",
+    return random.choice([s.upper(), s.lower(),
+                          s[0].upper() if len(s) > 0 else "",
                           s[0].lower() if len(s) > 0 else ""])
 
 
@@ -78,7 +78,9 @@ def udf(col_value):
     return udf
 
 
-def id(x):
+# This function is required because we need to apply a
+# function for every column and some columns do not change
+def identity(x):
     return x
 
 
@@ -118,37 +120,37 @@ def main():
         df = spark.read.format('bigquery').option('table', TABLE).load()
     except Py4JJavaError:
         print(f"{TABLE} does not exist. ")
-        sys.exit(0)
+        return
 
     # Declare data transformations for each column in dataframe
     udfs = [
         (dirty_data(trip_duration, True), StringType()),  # tripduration
-        (dirty_data(id, True), StringType()),  # starttime
-        (dirty_data(id, True), StringType()),  # stoptime
-        (id, IntegerType()),  # start_station_id
+        (dirty_data(identity, True), StringType()),  # starttime
+        (dirty_data(identity, True), StringType()),  # stoptime
+        (identity, IntegerType()),  # start_station_id
         (dirty_data(station_name, False), StringType()),  # start_station_name
         (dirty_data(convert_angle, True), StringType()),  # start_station_latitude
         (dirty_data(convert_angle, True), StringType()),  # start_station_longitude
-        (id, IntegerType()),  # end_station_id
+        (identity, IntegerType()),  # end_station_id
         (dirty_data(station_name, False), StringType()),  # end_station_name
         (dirty_data(convert_angle, True), StringType()),  # end_station_latitude
         (dirty_data(convert_angle, True), StringType()),  # end_station_longitude
-        (id, IntegerType()),  # bikeid
+        (identity, IntegerType()),  # bikeid
         (dirty_data(user_type, False), StringType()),  # usertype
-        (id, IntegerType()),  # birth_year
+        (identity, IntegerType()),  # birth_year
         (dirty_data(gender, False), StringType()),  # gender
-        (id, StringType()),  # customer_plan
+        (identity, StringType()),  # customer_plan
     ]
 
     # Apply dirty transformations to df
     names = df.schema.names
     new_df = df.select(*[UserDefinedFunction(*udf)(column).alias(name)
                          for udf, column, name in zip(udfs, df.columns, names)])
 
-    new_df.sample(False, 0.0001, seed=50).show(n=100)
+    new_df.sample(False, 0.0001).show(n=100)
 
     # Duplicate about 0.01% of the rows
-    dup_df = new_df.sample(True, 0.0001, seed=42)
+    dup_df = new_df.sample(True, 0.0001)
 
     # Create final dirty dataframe
     df = new_df.union(dup_df)
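
Note on the id -> identity rename: the comment added in the diff explains that every column gets a (function, return type) pair, so columns that should pass through untouched still need a callable; the new name also avoids shadowing Python's built-in id. Below is a minimal sketch of that per-column UDF pattern, not the repository's code; the local SparkSession, the toy dataframe, and the shout helper are made up for illustration.

# Minimal sketch: apply one UDF per column, using identity for pass-through columns.
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType, StringType

spark = SparkSession.builder.master("local[*]").appName("udf-sketch").getOrCreate()
df = spark.createDataFrame([(1, "Subscriber"), (2, "Customer")],
                           ["bikeid", "usertype"])


def identity(x):
    # Passes the value through unchanged so every column can share the same pipeline.
    return x


def shout(s):
    # Hypothetical stand-in for one of the "dirty" transformations.
    return s.upper()


# One (function, return type) pair per column, mirroring the udfs list above.
udfs = [(identity, IntegerType()),   # bikeid is left unchanged
        (shout, StringType())]       # usertype is transformed

new_df = df.select(*[udf(f, t)(name).alias(name)
                     for (f, t), name in zip(udfs, df.columns)])

# Without an explicit seed, sample() draws a different random subset on each run,
# which is the effect of dropping seed=50 and seed=42 in the diff above.
new_df.sample(False, 0.5).show()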

data-science-onramp/data-ingestion/setup_test.py

Lines changed: 58 additions & 67 deletions
@@ -12,24 +12,36 @@
 # Set global variables
 PROJECT = os.environ['GCLOUD_PROJECT']
 REGION = "us-central1"
-ZONE = "us-central1-a"
 CLUSTER_NAME = f'setup-test-{uuid.uuid4()}'
 BUCKET_NAME = f'setup-test-code-{uuid.uuid4()}'
-
-BUCKET = None
+DESTINATION_BLOB_NAME = "setup.py"
+JOB_FILE_NAME = f'gs://{BUCKET_NAME}/setup.py'
+JOB_DETAILS = {  # Job configuration
+    'placement': {
+        'cluster_name': CLUSTER_NAME
+    },
+    'pyspark_job': {
+        'main_python_file_uri': JOB_FILE_NAME,
+        'args': [
+            BUCKET_NAME,
+            "--test",
+        ],
+        "jar_file_uris": [
+            "gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar"
+        ],
+    },
+}
 
 
 @pytest.fixture(autouse=True)
 def setup_and_teardown_cluster():
     # Create cluster configuration
-    zone_uri = \
-        f'https://www.googleapis.com/compute/v1/projects/{PROJECT}/zones/{ZONE}'
     cluster_data = {
         'project_id': PROJECT,
         'cluster_name': CLUSTER_NAME,
         'config': {
             'gce_cluster_config': {
-                'zone_uri': zone_uri,
+                'zone_uri': '',
                 "metadata": {
                     "PIP_PACKAGES": "google-cloud-storage"
                 },
@@ -59,9 +71,8 @@ def setup_and_teardown_cluster():
 
     # Create cluster using cluster client
     cluster_client = dataproc.ClusterControllerClient(client_options={
-        'api_endpoint': '{}-dataproc.googleapis.com:443'.format(REGION)
+        'api_endpoint': f'{REGION}-dataproc.googleapis.com:443'
     })
-
     operation = cluster_client.create_cluster(PROJECT, REGION, cluster_data)
 
     # Wait for cluster to provision
@@ -70,64 +81,48 @@ def setup_and_teardown_cluster():
     yield
 
     # Delete cluster
-    cluster_client = dataproc.ClusterControllerClient(client_options={
-        'api_endpoint': f'{REGION}-dataproc.googleapis.com:443'
-    })
-
     operation = cluster_client.delete_cluster(PROJECT, REGION,
                                               CLUSTER_NAME)
     operation.result()
 
 
 @pytest.fixture(autouse=True)
 def setup_and_teardown_bucket():
-    global BUCKET
     # Create GCS Bucket
     storage_client = storage.Client()
-    BUCKET = storage_client.create_bucket(BUCKET_NAME)
+    bucket = storage_client.create_bucket(BUCKET_NAME)
+
+    # Upload file
+    blob = bucket.blob(DESTINATION_BLOB_NAME)
+    blob.upload_from_filename("setup.py")
 
     yield
 
     # Delete GCS bucket
-    storage_client = storage.Client()
     bucket = storage_client.get_bucket(BUCKET_NAME)
     bucket.delete(force=True)
 
 
-def test_setup(capsys):
-    '''Tests setup.py by submitting it to a dataproc cluster'''
+def get_blob_from_path(path):
+    bucket_name = re.search("dataproc.+?/", path).group(0)[0:-1]
+    bucket = storage.Client().get_bucket(bucket_name)
+    output_location = re.search("google-cloud-dataproc.+", path).group(0)
+    return bucket.blob(output_location)
 
-    # Upload file
-    destination_blob_name = "setup.py"
-    blob = BUCKET.blob(destination_blob_name)
-    blob.upload_from_filename("setup.py")
 
-    job_file_name = "gs://" + BUCKET_NAME + "/setup.py"
-
-    # Create job configuration
-    job_details = {
-        'placement': {
-            'cluster_name': CLUSTER_NAME
-        },
-        'pyspark_job': {
-            'main_python_file_uri': job_file_name,
-            'args': [
-                BUCKET_NAME,
-                "--test",
-            ],
-            "jar_file_uris": [
-                "gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar"
-            ],
-        },
-    }
+def is_in_table(value, out):
+    return re.search(f"\| *{value}\|", out)
+
+
+def test_setup():
+    '''Tests setup.py by submitting it to a dataproc cluster'''
 
     # Submit job to dataproc cluster
     job_client = dataproc.JobControllerClient(client_options={
-        'api_endpoint': '{}-dataproc.googleapis.com:443'.format(REGION)
+        'api_endpoint': f'{REGION}-dataproc.googleapis.com:443'
    })
-
     response = job_client.submit_job_as_operation(project_id=PROJECT, region=REGION,
-                                                  job=job_details)
+                                                  job=JOB_DETAILS)
 
     # Wait for job to complete
     result = response.result()
@@ -150,32 +145,28 @@ def test_setup(capsys):
     assert re.search("20[0-9][0-9]\\|", out)
 
     # gender
-    assert "M" in out
-    assert "male" in out
-    assert "MALE" in out
-    assert "F" in out
-    assert "female" in out
-    assert "FEMALE" in out
-    assert "u" in out
-    assert "unknown" in out
-    assert "UNKNOWN" in out
+    assert is_in_table("M", out)
+    assert is_in_table("m", out)
+    assert is_in_table("male", out)
+    assert is_in_table("MALE", out)
+    assert is_in_table("F", out)
+    assert is_in_table("f", out)
+    assert is_in_table("female", out)
+    assert is_in_table("FEMALE", out)
+    assert is_in_table("U", out)
+    assert is_in_table("u", out)
+    assert is_in_table("unknown", out)
+    assert is_in_table("UNKNOWN", out)
 
     # customer_plan
-    assert "Subscriber" in out
-    assert "subscriber" in out
-    assert "SUBSCRIBER" in out
-    assert "sub" in out
-    assert "Customer" in out
-    assert "customer" in out
-    assert "CUSTOMER" in out
-    assert "cust" in out
+    assert is_in_table("Subscriber", out)
+    assert is_in_table("subscriber", out)
+    assert is_in_table("SUBSCRIBER", out)
+    assert is_in_table("sub", out)
+    assert is_in_table("Customer", out)
+    assert is_in_table("customer", out)
+    assert is_in_table("CUSTOMER", out)
+    assert is_in_table("cust", out)
 
     # Missing data
-    assert "null" in out
-
-
-def get_blob_from_path(path):
-    bucket_name = re.search("dataproc.+?/", path).group(0)[0:-1]
-    bucket = storage.Client().get_bucket(bucket_name)
-    output_location = re.search("google-cloud-dataproc.+", path).group(0)
-    return bucket.blob(output_location)
+    assert is_in_table("null", out)
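
Note on the new is_in_table() helper: it tightens the old substring assertions. "M" in out matches the letter M anywhere in the captured output, while the regex only matches a complete table cell of the form |   value| in the text printed by DataFrame.show(). The added single-letter cases (m, f, U) line up with the s[0].upper()/s[0].lower() branches introduced in setup.py. Below is a small self-contained sketch of the helper's behaviour; the table string is made up for illustration and is not real job output.

# Minimal sketch of cell-level matching against show()-style output.
import re


def is_in_table(value, out):
    # Matches "|<optional padding>value|", i.e. one complete table cell.
    return re.search(rf"\| *{value}\|", out)


# Hypothetical show() output, trimmed for illustration.
out = """
+------+----------+
|bikeid|  usertype|
+------+----------+
|     1|SUBSCRIBER|
|     2|  customer|
+------+----------+
"""

assert is_in_table("SUBSCRIBER", out)
assert is_in_table("customer", out)
assert "SUB" in out                  # a naive substring check passes here...
assert not is_in_table("SUB", out)   # ...but the cell-level check correctly does not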
