Commit 92cf763

add data ingestion code
1 parent 15fede4 commit 92cf763

5 files changed: 372 additions, 0 deletions

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
pytest==5.3.2

Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
grpcio==1.29.0
google-auth==1.16.0
google-auth-httplib2==0.0.3
google-cloud==0.34.0
google-cloud-storage==1.28.1
google-cloud-dataproc==0.8.0

Lines changed: 210 additions & 0 deletions
@@ -0,0 +1,210 @@
import os
import re

import uuid

from google.api_core.exceptions import GoogleAPICallError

from google.cloud import dataproc_v1 as dataproc
from google.cloud import storage
from google.cloud.exceptions import NotFound

import pytest

waiting_cluster_callback = False

# Set global variables
project = os.environ['GCLOUD_PROJECT']
region = "us-central1"
zone = "us-central1-a"
cluster_name = 'setup-test-{}'.format(str(uuid.uuid4()))
bucket_name = 'setup-test-code-{}'.format(str(uuid.uuid4()))


@pytest.fixture(autouse=True)
def teardown():
    yield

    # Delete cluster
    cluster_client = dataproc.ClusterControllerClient(client_options={
        'api_endpoint': f'{region}-dataproc.googleapis.com:443'
    })

    try:
        operation = cluster_client.delete_cluster(project, region,
                                                  cluster_name)
        operation.result()
    except GoogleAPICallError:
        pass

    # Delete GCS bucket
    storage_client = storage.Client()
    try:
        bucket = storage_client.get_bucket(bucket_name)
        bucket.delete(force=True)
    except NotFound:
        pass


def test_setup(capsys):
    '''Tests setup.py by submitting it to a dataproc cluster'''

    # Create GCS Bucket
    storage_client = storage.Client()
    bucket = storage_client.create_bucket(bucket_name)

    # Upload file
    destination_blob_name = "setup.py"
    blob = bucket.blob(destination_blob_name)
    blob.upload_from_filename("setup.py")

    job_file_name = "gs://" + bucket_name + "/setup.py"

    # Create cluster configuration
    zone_uri = \
        'https://www.googleapis.com/compute/v1/projects/{}/zones/{}'.format(
            project, zone)
    cluster_data = {
        'project_id': project,
        'cluster_name': cluster_name,
        'config': {
            'gce_cluster_config': {
                'zone_uri': zone_uri,
                "metadata": {
                    "PIP_PACKAGES": "google-cloud-storage"
                },
            },
            'master_config': {
                'num_instances': 1,
                'machine_type_uri': 'n1-standard-8'
            },
            'worker_config': {
                'num_instances': 6,
                'machine_type_uri': 'n1-standard-8'
            },
            "initialization_actions": [
                {
                    "executable_file": ("gs://dataproc-initialization-actions/"
                                        "python/pip-install.sh"),
                }
            ],
            "software_config": {
                "image_version": "1.5.4-debian10",
                "optional_components": [
                    "ANACONDA"
                ],
            }
        }
    }

    # Create cluster using cluster client
    cluster_client = dataproc.ClusterControllerClient(client_options={
        'api_endpoint': '{}-dataproc.googleapis.com:443'.format(region)
    })

    cluster = cluster_client.create_cluster(project, region, cluster_data)
    cluster.add_done_callback(callback)

    # Wait for cluster to provision
    global waiting_cluster_callback
    waiting_cluster_callback = True

    wait_for_cluster_creation()

    # Create job configuration
    job_details = {
        'placement': {
            'cluster_name': cluster_name
        },
        'pyspark_job': {
            'main_python_file_uri': job_file_name,
            'args': [
                bucket_name,
                "--test",
            ],
            "jar_file_uris": [
                "gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar"
            ],
        },
    }

    # Submit job to dataproc cluster
    job_client = dataproc.JobControllerClient(client_options={
        'api_endpoint': '{}-dataproc.googleapis.com:443'.format(region)
    })

    result = job_client.submit_job(project_id=project, region=region,
                                   job=job_details)

    job_id = result.reference.job_id
    print('Submitted job \"{}\".'.format(job_id))

    # Wait for job to complete
    wait_for_job(job_client, job_id)

    # Get job output
    cluster_info = cluster_client.get_cluster(project, region, cluster_name)
    bucket = storage_client.get_bucket(cluster_info.config.config_bucket)
    output_blob = (
        'google-cloud-dataproc-metainfo/{}/jobs/{}/driveroutput.000000000'
        .format(cluster_info.cluster_uuid, job_id))
    out = bucket.blob(output_blob).download_as_string().decode("utf-8")

    # tripDuration
    assert re.search("[0-9] s", out)
    assert re.search("[0-9] m", out)
    assert re.search("[0-9] h", out)

    # station latitude & longitude
    assert re.search(u"\u00B0" + "[0-9]+\'[0-9]+\"", out)

    # birth_year
    assert re.search("19[0-9][0-9]\\|", out)
    assert re.search("20[0-9][0-9]\\|", out)

    # gender
    assert "M" in out
    assert "male" in out
    assert "MALE" in out
    assert "F" in out
    assert "female" in out
    assert "FEMALE" in out
    assert "u" in out
    assert "unknown" in out
    assert "UNKNOWN" in out

    # customer_plan
    assert "Subscriber" in out
    assert "subscriber" in out
    assert "SUBSCRIBER" in out
    assert "sub" in out
    assert "Customer" in out
    assert "customer" in out
    assert "CUSTOMER" in out
    assert "cust" in out

    # Missing data
    assert "null" in out


def callback(operation_future):
    '''Sets a flag to stop waiting'''
    global waiting_cluster_callback
    waiting_cluster_callback = False


def wait_for_cluster_creation():
    '''Waits for cluster to create'''
    while True:
        if not waiting_cluster_callback:
            break


def wait_for_job(job_client, job_id):
    '''Waits for job to finish'''
    while True:
        job = job_client.get_job(project, region, job_id)
        assert job.status.State.Name(job.status.state) != "ERROR"

        if job.status.State.Name(job.status.state) == "DONE":
            return
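
Usage note, not part of the committed diff: the test above reads its project from the GCLOUD_PROJECT environment variable and provisions a real Dataproc cluster and GCS bucket, so it needs authenticated Google Cloud credentials plus the pinned packages from the two requirements files. A minimal sketch of running it; the test file path and project id are assumptions, since the diff does not show file names:

# Hypothetical invocation; setup_test.py and my-gcp-project are placeholders.
export GCLOUD_PROJECT=my-gcp-project
pip install pytest==5.3.2 google-cloud-storage==1.28.1 google-cloud-dataproc==0.8.0
pytest setup_test.py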

Lines changed: 149 additions & 0 deletions
@@ -0,0 +1,149 @@
from random import choice, choices, randint, seed
import sys

from time import time_ns

from google.cloud import bigquery

from py4j.protocol import Py4JJavaError
from pyspark.sql import SparkSession

from pyspark.sql.functions import UserDefinedFunction
from pyspark.sql.types import IntegerType, StringType


# Create a SparkSession under the name "setup". Viewable via the Spark UI
spark = SparkSession.builder.appName("setup").getOrCreate()

bucket_name = sys.argv[1]
upload = True  # Whether to upload data to BigQuery

# Check whether or not results should be uploaded
try:
    sys.argv[2]
    upload = False
except IndexError:
    print("Results will be uploaded to BigQuery")

table = "bigquery-public-data.new_york_citibike.citibike_trips"

# Check if table exists
try:
    df = spark.read.format('bigquery').option('table', table).load()
except Py4JJavaError:
    print(f"{table} does not exist. ")
    sys.exit(0)

# START MAKING DATA DIRTY


def random_select(items, cum_weights):
    '''Picks an item according to the cumulative weights'''
    return choices(items, cum_weights=cum_weights, k=1)[0]


def tripduration(duration):
    '''Converts trip duration to other units'''
    seconds = str(duration) + " s"
    minutes = str(float(duration) / 60) + " min"
    hours = str(float(duration) / 3600) + " h"
    return random_select([seconds, minutes, hours, str(randint(-1000, -1))],
                         [0.3, 0.6, 0.9, 1])


def station_name(name):
    '''Replaces '&' with '/' with a 50% chance'''
    return choice([name, name.replace("&", "/")])


def usertype(user):
    '''Manipulates the user type string'''
    return choice([user, user.upper(), user.lower(),
                   "sub" if user == "Subscriber" else user,
                   "cust" if user == "Customer" else user])


def gender(s):
    '''Manipulates the gender string'''
    return choice([s, s.upper(), s.lower(),
                   s[0] if len(s) > 0 else "",
                   s[0].lower() if len(s) > 0 else ""])


def convertAngle(angle):
    '''Converts long and lat to DMS notation'''
    degrees = int(angle)
    minutes = int((angle - degrees) * 60)
    seconds = int((angle - degrees - minutes/60) * 3600)
    new_angle = str(degrees) + u"\u00B0" + \
        str(minutes) + "'" + str(seconds) + '"'
    return random_select([str(angle), new_angle], cum_weights=[0.55, 1])


def dirty_data(proc_func, allow_none):
    '''Master function returns a user defined function
    that transforms the column data'''
    def udf(col_value):
        seed(hash(col_value) + time_ns())
        if col_value is None:
            return col_value
        elif allow_none:
            return random_select([None, proc_func(col_value)],
                                 cum_weights=[0.05, 1])
        else:
            return proc_func(col_value)
    return udf


def id(x):
    return x


# Declare data transformations for each column in dataframe
udfs = [
    (dirty_data(tripduration, True), StringType()),  # tripduration
    (dirty_data(id, True), StringType()),  # starttime
    (dirty_data(id, True), StringType()),  # stoptime
    (id, IntegerType()),  # start_station_id
    (dirty_data(station_name, False), StringType()),  # start_station_name
    (dirty_data(convertAngle, True), StringType()),  # start_station_latitude
    (dirty_data(convertAngle, True), StringType()),  # start_station_longitude
    (id, IntegerType()),  # end_station_id
    (dirty_data(station_name, False), StringType()),  # end_station_name
    (dirty_data(convertAngle, True), StringType()),  # end_station_latitude
    (dirty_data(convertAngle, True), StringType()),  # end_station_longitude
    (id, IntegerType()),  # bikeid
    (dirty_data(usertype, False), StringType()),  # usertype
    (id, IntegerType()),  # birth_year
    (dirty_data(gender, False), StringType()),  # gender
    (id, StringType()),  # customer_plan
]

# Apply dirty transformations to df
names = df.schema.names
new_df = df.select(*[UserDefinedFunction(*udf)(column).alias(name)
                     for udf, column, name in zip(udfs, df.columns, names)])

# Duplicate about 0.01% of the rows
dup_df = new_df.sample(False, 0.0001, seed=42)

# Create final dirty dataframe
df = new_df.union(dup_df)
df.sample(False, 0.0001, seed=50).show(n=200)
print("Dataframe sample printed")

# Write to BigQuery
if upload:
    # Create BigQuery Dataset
    client = bigquery.Client()
    dataset_id = '{}.new_york_citibike_trips'.format(client.project)
    dataset = bigquery.Dataset(dataset_id)
    dataset.location = "US"
    dataset = client.create_dataset(dataset)

    # Saving the data to BigQuery
    spark.conf.set('temporaryGcsBucket', bucket_name)

    df.write.format('bigquery') \
        .option('table', dataset_id + ".RAW_DATA") \
        .save()

Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
# Submit a PySpark job via the Cloud Dataproc Jobs API
gcloud dataproc jobs submit pyspark \
    --cluster ${CLUSTER_NAME} \
    --jars gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar \
    --driver-log-levels root=FATAL \
    setup.py -- ${BUCKET_NAME}
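
Usage note, not from the commit itself: the submit command expects CLUSTER_NAME and BUCKET_NAME to already be exported in the shell, and the bucket name after the -- separator is forwarded to the job, where setup.py picks it up as sys.argv[1] and sets it as temporaryGcsBucket for the BigQuery write. A minimal sketch with placeholder values; the script's own file name is not shown in this diff:

# Hypothetical values; substitute a real Dataproc cluster and staging bucket.
export CLUSTER_NAME=my-dataproc-cluster
export BUCKET_NAME=my-staging-bucket
bash submit_job.sh   # assumed file name for the snippet above

Because no second argument follows the bucket name here, upload stays True in setup.py, so this invocation is the path that actually writes the dirtied data to BigQuery; the test harness instead passes --test, which flips upload to False.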
