# Set global variables
PROJECT = os.environ['GCLOUD_PROJECT']
REGION = "us-central1"
- ZONE = "us-central1-a"
CLUSTER_NAME = f'setup-test-{uuid.uuid4()}'
BUCKET_NAME = f'setup-test-code-{uuid.uuid4()}'
-
- BUCKET = None
+ DESTINATION_BLOB_NAME = "setup.py"
+ JOB_FILE_NAME = f'gs://{BUCKET_NAME}/setup.py'
+ JOB_DETAILS = {  # Job configuration
+     'placement': {
+         'cluster_name': CLUSTER_NAME
+     },
+     'pyspark_job': {
+         'main_python_file_uri': JOB_FILE_NAME,
+         'args': [
+             BUCKET_NAME,
+             "--test",
+         ],
+         "jar_file_uris": [
+             "gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar"
+         ],
+     },
+ }


@pytest.fixture(autouse=True)
def setup_and_teardown_cluster():
    # Create cluster configuration
-     zone_uri = \
-         f'https://www.googleapis.com/compute/v1/projects/{PROJECT}/zones/{ZONE}'
    cluster_data = {
        'project_id': PROJECT,
        'cluster_name': CLUSTER_NAME,
        'config': {
            'gce_cluster_config': {
-                 'zone_uri': zone_uri,
+                 'zone_uri': '',
                "metadata": {
                    "PIP_PACKAGES": "google-cloud-storage"
                },
@@ -59,9 +71,8 @@ def setup_and_teardown_cluster():

    # Create cluster using cluster client
    cluster_client = dataproc.ClusterControllerClient(client_options={
-         'api_endpoint': '{}-dataproc.googleapis.com:443'.format(REGION)
+         'api_endpoint': f'{REGION}-dataproc.googleapis.com:443'
    })
-
    operation = cluster_client.create_cluster(PROJECT, REGION, cluster_data)

    # Wait for cluster to provision
@@ -70,64 +81,48 @@ def setup_and_teardown_cluster():
    yield

    # Delete cluster
-     cluster_client = dataproc.ClusterControllerClient(client_options={
-         'api_endpoint': f'{REGION}-dataproc.googleapis.com:443'
-     })
-
    operation = cluster_client.delete_cluster(PROJECT, REGION,
                                              CLUSTER_NAME)
    operation.result()


@pytest.fixture(autouse=True)
def setup_and_teardown_bucket():
-     global BUCKET
    # Create GCS Bucket
    storage_client = storage.Client()
-     BUCKET = storage_client.create_bucket(BUCKET_NAME)
+     bucket = storage_client.create_bucket(BUCKET_NAME)
+
+     # Upload file
+     blob = bucket.blob(DESTINATION_BLOB_NAME)
+     blob.upload_from_filename("setup.py")

    yield

    # Delete GCS bucket
-     storage_client = storage.Client()
    bucket = storage_client.get_bucket(BUCKET_NAME)
    bucket.delete(force=True)


- def test_setup(capsys):
-     '''Tests setup.py by submitting it to a dataproc cluster'''
+ def get_blob_from_path(path):
+     bucket_name = re.search("dataproc.+?/", path).group(0)[0:-1]
+     bucket = storage.Client().get_bucket(bucket_name)
+     output_location = re.search("google-cloud-dataproc.+", path).group(0)
+     return bucket.blob(output_location)

-     # Upload file
-     destination_blob_name = "setup.py"
-     blob = BUCKET.blob(destination_blob_name)
-     blob.upload_from_filename("setup.py")

-     job_file_name = "gs://" + BUCKET_NAME + "/setup.py"
-
-     # Create job configuration
-     job_details = {
-         'placement': {
-             'cluster_name': CLUSTER_NAME
-         },
-         'pyspark_job': {
-             'main_python_file_uri': job_file_name,
-             'args': [
-                 BUCKET_NAME,
-                 "--test",
-             ],
-             "jar_file_uris": [
-                 "gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar"
-             ],
-         },
-     }
+ def is_in_table(value, out):
+     return re.search(f"\| *{value} \|", out)
+
+
+ def test_setup():
+     '''Tests setup.py by submitting it to a dataproc cluster'''

    # Submit job to dataproc cluster
    job_client = dataproc.JobControllerClient(client_options={
-         'api_endpoint': '{}-dataproc.googleapis.com:443'.format(REGION)
+         'api_endpoint': f'{REGION}-dataproc.googleapis.com:443'
    })
-
    response = job_client.submit_job_as_operation(project_id=PROJECT, region=REGION,
-                                                   job=job_details)
+                                                   job=JOB_DETAILS)

    # Wait for job to complete
    result = response.result()
@@ -150,32 +145,28 @@ def test_setup(capsys):
    assert re.search("20[0-9][0-9]\\|", out)

    # gender
-     assert "M" in out
-     assert "male" in out
-     assert "MALE" in out
-     assert "F" in out
-     assert "female" in out
-     assert "FEMALE" in out
-     assert "u" in out
-     assert "unknown" in out
-     assert "UNKNOWN" in out
+     assert is_in_table("M", out)
+     assert is_in_table("m", out)
+     assert is_in_table("male", out)
+     assert is_in_table("MALE", out)
+     assert is_in_table("F", out)
+     assert is_in_table("f", out)
+     assert is_in_table("female", out)
+     assert is_in_table("FEMALE", out)
+     assert is_in_table("U", out)
+     assert is_in_table("u", out)
+     assert is_in_table("unknown", out)
+     assert is_in_table("UNKNOWN", out)

    # customer_plan
-     assert "Subscriber" in out
-     assert "subscriber" in out
-     assert "SUBSCRIBER" in out
-     assert "sub" in out
-     assert "Customer" in out
-     assert "customer" in out
-     assert "CUSTOMER" in out
-     assert "cust" in out
+     assert is_in_table("Subscriber", out)
+     assert is_in_table("subscriber", out)
+     assert is_in_table("SUBSCRIBER", out)
+     assert is_in_table("sub", out)
+     assert is_in_table("Customer", out)
+     assert is_in_table("customer", out)
+     assert is_in_table("CUSTOMER", out)
+     assert is_in_table("cust", out)

    # Missing data
-     assert "null" in out
-
-
- def get_blob_from_path(path):
-     bucket_name = re.search("dataproc.+?/", path).group(0)[0:-1]
-     bucket = storage.Client().get_bucket(bucket_name)
-     output_location = re.search("google-cloud-dataproc.+", path).group(0)
-     return bucket.blob(output_location)
+     assert is_in_table("null", out)
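
Aside (not part of the commit): the new is_in_table helper treats a value as present only when it fills a whole cell between "|" separators in the captured job output, instead of the bare substring checks ("M" in out) used before, so short tokens like "M" or "u" can no longer match inside unrelated words. A minimal, self-contained sketch of that behaviour, with made-up sample output:

import re


def is_in_table(value, out):
    # Same pattern as the helper added in the diff: the value must be flanked
    # by pipe characters, i.e. appear as its own table cell. The doubled
    # backslashes avoid the invalid-escape warning; the regex is identical.
    return re.search(f"\\| *{value} \\|", out)


# Hypothetical, hand-written output in the pipe-delimited style the test parses.
sample_out = """
| M | Subscriber |
| F | Customer |
"""

assert is_in_table("M", sample_out)             # whole cell -> matches
assert is_in_table("Subscriber", sample_out)    # whole cell -> matches
assert not is_in_table("male", sample_out)      # absent value -> no match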