
Commit 744f80c

get dataproc job output and fix linting
1 parent 4afbf1c commit 744f80c

File tree

3 files changed: 22 additions & 21 deletions

.gitignore

Lines changed: 1 addition & 0 deletions

@@ -27,3 +27,4 @@ credentials.dat
 .DS_store
 env/
 .idea
+data-science-onramp/data-ingestion/noxfile.py

data-science-onramp/data-ingestion/setup.py

Lines changed: 8 additions & 6 deletions

@@ -12,7 +12,7 @@
 from pyspark.sql.types import IntegerType, StringType
 
 
-BUCKET_NAME = sys.argv[1]
+BUCKET_NAME = sys.argv[1]
 TABLE = "bigquery-public-data.new_york_citibike.citibike_trips"
 
 
@@ -81,7 +81,8 @@ def udf(col_value):
 def id(x):
     return x
 
-def write_to_bigquery(df):
+
+def write_to_bigquery(spark, df):
     '''Write a dataframe to BigQuery'''
 
     # Create BigQuery Dataset
@@ -98,6 +99,7 @@ def write_to_bigquery(df):
         .option('table', dataset_id + ".RAW_DATA") \
         .save()
 
+
 def main():
     # Create a SparkSession under the name "setup". Viewable via the Spark UI
     spark = SparkSession.builder.appName("setup").getOrCreate()
@@ -143,16 +145,16 @@ def main():
     new_df = df.select(*[UserDefinedFunction(*udf)(column).alias(name)
                          for udf, column, name in zip(udfs, df.columns, names)])
 
+    new_df.sample(False, 0.0001, seed=50).show(n=100)
+
     # Duplicate about 0.01% of the rows
-    dup_df = new_df.sample(False, 0.0001, seed=42)
+    dup_df = new_df.sample(True, 0.0001, seed=42)
 
     # Create final dirty dataframe
     df = new_df.union(dup_df)
-    df.sample(False, 0.0001, seed=50).show(n=200)
-    print("Dataframe sample printed")
 
     if upload:
-        write_to_bigquery(df)
+        write_to_bigquery(spark, df)
 
 
 if __name__ == '__main__':
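For context, the change above now passes the SparkSession into write_to_bigquery and flips the duplicate sample to sampling with replacement. A minimal sketch of how the updated pieces could fit together follows; the dataset name, the temporaryGcsBucket setting, and the toy dataframe are illustrative assumptions, since this diff only shows fragments of setup.py.

import sys

from google.cloud import bigquery
from pyspark.sql import SparkSession

BUCKET_NAME = sys.argv[1]


def write_to_bigquery(spark, df):
    '''Write a dataframe to BigQuery (sketch; the dataset name is assumed).'''
    # Create the BigQuery dataset if it does not already exist.
    client = bigquery.Client()
    dataset_id = client.project + ".data_science_onramp"  # hypothetical name
    client.create_dataset(dataset_id, exists_ok=True)

    # One plausible reason the session is now passed in: the spark-bigquery
    # connector can read its GCS staging bucket from the session config.
    spark.conf.set("temporaryGcsBucket", BUCKET_NAME)

    df.write.format("bigquery") \
        .option("table", dataset_id + ".RAW_DATA") \
        .save()


def main():
    spark = SparkSession.builder.appName("setup").getOrCreate()

    # Toy dataframe standing in for the transformed citibike data.
    new_df = spark.createDataFrame([(i, i * 60) for i in range(1000)],
                                   ["trip_id", "tripduration"])

    # sample(True, ...) samples WITH replacement, so the ~0.01% of rows
    # drawn here can themselves repeat -- the intended "dirty" duplicates.
    dup_df = new_df.sample(True, 0.0001, seed=42)
    df = new_df.union(dup_df)

    write_to_bigquery(spark, df)


if __name__ == "__main__":
    main()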

data-science-onramp/data-ingestion/setup_test.py

Lines changed: 13 additions & 15 deletions

@@ -3,11 +3,8 @@
 
 import uuid
 
-from google.api_core.exceptions import GoogleAPICallError
-
 from google.cloud import dataproc_v1 as dataproc
 from google.cloud import storage
-from google.cloud.exceptions import NotFound
 
 import pytest
 
@@ -52,7 +49,7 @@ def setup_and_teardown_cluster():
             }
         ],
         "software_config": {
-            "image_version": "1.4-debian10",
+            "image_version": "1.5.4-debian10",
             "optional_components": [
                 "ANACONDA"
             ],
@@ -96,6 +93,7 @@ def setup_and_teardown_bucket():
     bucket = storage_client.get_bucket(BUCKET_NAME)
     bucket.delete(force=True)
 
+
 def test_setup(capsys):
     '''Tests setup.py by submitting it to a dataproc cluster'''
 
@@ -129,22 +127,15 @@ def test_setup(capsys):
     })
 
     response = job_client.submit_job_as_operation(project_id=PROJECT, region=REGION,
-                                                 job=job_details)
+                                                  job=job_details)
 
     # Wait for job to complete
     result = response.result()
 
-    cluster_client = dataproc.ClusterControllerClient(client_options={
-        'api_endpoint': '{}-dataproc.googleapis.com:443'.format(REGION)
-    })
-
-    cluster_info = cluster_client.get_cluster(PROJECT, REGION, CLUSTER_NAME)
-
     # Get job output
-    output_location = result.driver_output_resource_uri + "000000000"  # + "driveroutput.000000000"
-    storage_client = storage.Client()
-    bucket = storage_client.get_bucket(cluster_info.config.config_bucket)
-    output = bucket.blob(output_location).download_as_string().decode("utf-8")
+    output_location = result.driver_output_resource_uri + ".000000000"
+    blob = get_blob_from_path(output_location)
+    out = blob.download_as_string().decode("utf-8")
 
     # tripDuration
     assert re.search("[0-9] s", out)
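For reference, a hedged sketch of the submit-and-wait flow the test now uses, before the file's final hunk (below) adds the get_blob_from_path helper that maps the returned URI to a Cloud Storage blob. The constants and job arguments here are hypothetical stand-ins for the fixtures defined elsewhere in setup_test.py.

from google.cloud import dataproc_v1 as dataproc

# Hypothetical stand-ins for the module-level constants used by the test.
PROJECT = "my-project"
REGION = "us-central1"
CLUSTER_NAME = "setup-test-cluster"
BUCKET_NAME = "setup-test-bucket"


def submit_setup_job():
    '''Submit setup.py as a PySpark job and return the driver-output URI.'''
    job_client = dataproc.JobControllerClient(client_options={
        'api_endpoint': '{}-dataproc.googleapis.com:443'.format(REGION)
    })

    job_details = {
        'placement': {'cluster_name': CLUSTER_NAME},
        'pyspark_job': {
            # Assumed layout: the script under test is staged in the test bucket.
            'main_python_file_uri': 'gs://{}/setup.py'.format(BUCKET_NAME),
            'args': [BUCKET_NAME, '--test'],
        },
    }

    # submit_job_as_operation returns a long-running operation; result()
    # blocks until the job finishes and yields the final Job message.
    operation = job_client.submit_job_as_operation(
        project_id=PROJECT, region=REGION, job=job_details)
    result = operation.result()

    # Dataproc writes the driver's stdout to the cluster's staging bucket
    # under this URI; the first output chunk carries a ".000000000" suffix.
    return result.driver_output_resource_uri + ".000000000"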
@@ -181,3 +172,10 @@ def test_setup(capsys):
 
     # Missing data
     assert "null" in out
+
+
+def get_blob_from_path(path):
+    bucket_name = re.search("dataproc.+?/", path).group(0)[0:-1]
+    bucket = storage.Client().get_bucket(bucket_name)
+    output_location = re.search("google-cloud-dataproc.+", path).group(0)
+    return bucket.blob(output_location)
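The new helper leans on the shape of the driver-output URI: a Dataproc staging bucket whose name begins with "dataproc", followed by an object path under "google-cloud-dataproc-metainfo/". A quick illustration of the two regexes with a made-up URI (the bucket name and cluster UUID below are invented):

import re

uri = ("gs://dataproc-staging-us-central1-123456789012-abcd1234/"
       "google-cloud-dataproc-metainfo/1111-2222-3333/jobs/setup-test-job/"
       "driveroutput.000000000")

# "dataproc.+?/" grabs the shortest run from "dataproc" to the next slash,
# i.e. the bucket name plus a trailing "/", which [0:-1] strips off.
bucket_name = re.search("dataproc.+?/", uri).group(0)[0:-1]
# -> "dataproc-staging-us-central1-123456789012-abcd1234"

# "google-cloud-dataproc.+" grabs everything from the metainfo prefix onward,
# which is the blob (object) name inside that bucket.
blob_name = re.search("google-cloud-dataproc.+", uri).group(0)
# -> "google-cloud-dataproc-metainfo/.../jobs/setup-test-job/driveroutput.000000000"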
