diff --git a/dataproc/README.md b/dataproc/README.md index 98622be7dc1..150e90cc3f6 100644 --- a/dataproc/README.md +++ b/dataproc/README.md @@ -1,84 +1,3 @@ -# Cloud Dataproc API Examples +These samples have been moved. -[![Open in Cloud Shell][shell_img]][shell_link] - -[shell_img]: http://gstatic.com/cloudssh/images/open-btn.png -[shell_link]: https://console.cloud.google.com/cloudshell/open?git_repo=https://github.com/GoogleCloudPlatform/python-docs-samples&page=editor&open_in_editor=dataproc/README.md - -Sample command-line programs for interacting with the Cloud Dataproc API. - -See [the tutorial on the using the Dataproc API with the Python client -library](https://cloud.google.com/dataproc/docs/tutorials/python-library-example) -for information on a walkthrough you can run to try out the Cloud Dataproc API sample code. - -Note that while this sample demonstrates interacting with Dataproc via the API, the functionality demonstrated here could also be accomplished using the Cloud Console or the gcloud CLI. - -`list_clusters.py` is a simple command-line program to demonstrate connecting to the Cloud Dataproc API and listing the clusters in a region. - -`submit_job_to_cluster.py` demonstrates how to create a cluster, submit the -`pyspark_sort.py` job, download the output from Google Cloud Storage, and output the result. - -`single_job_workflow.py` uses the Cloud Dataproc InstantiateInlineWorkflowTemplate API to create an ephemeral cluster, run a job, then delete the cluster with one API request. - -`pyspark_sort.py_gcs` is the same as `pyspark_sort.py` but demonstrates - reading from a GCS bucket. - -## Prerequisites to run locally: - -* [pip](https://pypi.python.org/pypi/pip) - -Go to the [Google Cloud Console](https://console.cloud.google.com). - -Under API Manager, search for the Google Cloud Dataproc API and enable it. - -## Set Up Your Local Dev Environment - -To install, run the following commands. If you want to use [virtualenv](https://virtualenv.readthedocs.org/en/latest/) -(recommended), run the commands within a virtualenv. - - * pip install -r requirements.txt - -## Authentication - -Please see the [Google cloud authentication guide](https://cloud.google.com/docs/authentication/). -The recommended approach to running these samples is a Service Account with a JSON key. - -## Environment Variables - -Set the following environment variables: - - GOOGLE_CLOUD_PROJECT=your-project-id - REGION=us-central1 # or your region - CLUSTER_NAME=waprin-spark7 - ZONE=us-central1-b - -## Running the samples - -To run list_clusters.py: - - python list_clusters.py $GOOGLE_CLOUD_PROJECT --region=$REGION - -`submit_job_to_cluster.py` can create the Dataproc cluster or use an existing cluster. To create a cluster before running the code, you can use the [Cloud Console](console.cloud.google.com) or run: - - gcloud dataproc clusters create your-cluster-name - -To run submit_job_to_cluster.py, first create a GCS bucket (used by Cloud Dataproc to stage files) from the Cloud Console or with gsutil: - - gsutil mb gs:// - -Next, set the following environment variables: - - BUCKET=your-staging-bucket - CLUSTER=your-cluster-name - -Then, if you want to use an existing cluster, run: - - python submit_job_to_cluster.py --project_id=$GOOGLE_CLOUD_PROJECT --zone=us-central1-b --cluster_name=$CLUSTER --gcs_bucket=$BUCKET - -Alternatively, to create a new cluster, which will be deleted at the end of the job, run: - - python submit_job_to_cluster.py --project_id=$GOOGLE_CLOUD_PROJECT --zone=us-central1-b --cluster_name=$CLUSTER --gcs_bucket=$BUCKET --create_new_cluster - -The script will setup a cluster, upload the PySpark file, submit the job, print the result, then, if it created the cluster, delete the cluster. - -Optionally, you can add the `--pyspark_file` argument to change from the default `pyspark_sort.py` included in this script to a new script. +https://github.com/googleapis/python-dataproc/tree/master/samples \ No newline at end of file diff --git a/dataproc/create_cluster.py b/dataproc/create_cluster.py deleted file mode 100644 index b4d63d2e13f..00000000000 --- a/dataproc/create_cluster.py +++ /dev/null @@ -1,77 +0,0 @@ -#!/usr/bin/env python - -# Copyright 2019 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# This sample walks a user through creating a Cloud Dataproc cluster using -# the Python client library. -# -# This script can be run on its own: -# python create_cluster.py ${PROJECT_ID} ${REGION} ${CLUSTER_NAME} - - -import sys - -# [START dataproc_create_cluster] -from google.cloud import dataproc_v1 as dataproc - - -def create_cluster(project_id, region, cluster_name): - """This sample walks a user through creating a Cloud Dataproc cluster - using the Python client library. - - Args: - project_id (string): Project to use for creating resources. - region (string): Region where the resources should live. - cluster_name (string): Name to use for creating a cluster. - """ - - # Create a client with the endpoint set to the desired cluster region. - cluster_client = dataproc.ClusterControllerClient(client_options={ - 'api_endpoint': f'{region}-dataproc.googleapis.com:443', - }) - - # Create the cluster config. - cluster = { - 'project_id': project_id, - 'cluster_name': cluster_name, - 'config': { - 'master_config': { - 'num_instances': 1, - 'machine_type_uri': 'n1-standard-1' - }, - 'worker_config': { - 'num_instances': 2, - 'machine_type_uri': 'n1-standard-1' - } - } - } - - # Create the cluster. - operation = cluster_client.create_cluster(project_id, region, cluster) - result = operation.result() - - # Output a success message. - print(f'Cluster created successfully: {result.cluster_name}') - # [END dataproc_create_cluster] - - -if __name__ == "__main__": - if len(sys.argv) < 4: - sys.exit('python create_cluster.py project_id region cluster_name') - - project_id = sys.argv[1] - region = sys.argv[2] - cluster_name = sys.argv[3] - create_cluster(project_id, region, cluster_name) diff --git a/dataproc/create_cluster_test.py b/dataproc/create_cluster_test.py deleted file mode 100644 index 6b1d6806100..00000000000 --- a/dataproc/create_cluster_test.py +++ /dev/null @@ -1,47 +0,0 @@ -# Copyright 2019 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import uuid - -from google.cloud import dataproc_v1 as dataproc -import pytest - -import create_cluster - - -PROJECT_ID = os.environ['GOOGLE_CLOUD_PROJECT'] -REGION = 'us-central1' -CLUSTER_NAME = 'py-cc-test-{}'.format(str(uuid.uuid4())) - - -@pytest.fixture(autouse=True) -def teardown(): - yield - - cluster_client = dataproc.ClusterControllerClient(client_options={ - 'api_endpoint': f'{REGION}-dataproc.googleapis.com:443' - }) - # Client library function - operation = cluster_client.delete_cluster(PROJECT_ID, REGION, CLUSTER_NAME) - # Wait for cluster to delete - operation.result() - - -def test_cluster_create(capsys): - # Wrapper function for client library function - create_cluster.create_cluster(PROJECT_ID, REGION, CLUSTER_NAME) - - out, _ = capsys.readouterr() - assert CLUSTER_NAME in out diff --git a/dataproc/dataproc_e2e_donttest.py b/dataproc/dataproc_e2e_donttest.py deleted file mode 100644 index 44cc03bfd42..00000000000 --- a/dataproc/dataproc_e2e_donttest.py +++ /dev/null @@ -1,32 +0,0 @@ -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" Integration tests for Dataproc samples. - -Creates a Dataproc cluster, uploads a pyspark file to Google Cloud Storage, -submits a job to Dataproc that runs the pyspark file, then downloads -the output logs from Cloud Storage and verifies the expected output.""" - -import os - -import submit_job_to_cluster - -PROJECT = os.environ['GOOGLE_CLOUD_PROJECT'] -BUCKET = os.environ['CLOUD_STORAGE_BUCKET'] -CLUSTER_NAME = 'testcluster3' -ZONE = 'us-central1-b' - - -def test_e2e(): - output = submit_job_to_cluster.main( - PROJECT, ZONE, CLUSTER_NAME, BUCKET) - assert b"['Hello,', 'dog', 'elephant', 'panther', 'world!']" in output diff --git a/dataproc/instantiate_inline_workflow_template.py b/dataproc/instantiate_inline_workflow_template.py deleted file mode 100644 index f9358376f9f..00000000000 --- a/dataproc/instantiate_inline_workflow_template.py +++ /dev/null @@ -1,107 +0,0 @@ -# Copyright 2020 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# This sample walks a user through instantiating an inline -# workflow for Cloud Dataproc using the Python client library. -# -# This script can be run on its own: -# python instantiate_inline_workflow_template.py ${PROJECT_ID} ${REGION} - - -import sys - -# [START dataproc_instantiate_inline_workflow_template] -from google.cloud import dataproc_v1 as dataproc - - -def instantiate_inline_workflow_template(project_id, region): - """This sample walks a user through submitting a workflow - for a Cloud Dataproc using the Python client library. - - Args: - project_id (string): Project to use for running the workflow. - region (string): Region where the workflow resources should live. - """ - - # Create a client with the endpoint set to the desired region. - workflow_template_client = dataproc.WorkflowTemplateServiceClient( - client_options={ - 'api_endpoint': f'{region}-dataproc.googleapis.com:443' - } - ) - - parent = workflow_template_client.region_path(project_id, region) - - template = { - 'jobs': [ - { - 'hadoop_job': { - 'main_jar_file_uri': 'file:///usr/lib/hadoop-mapreduce/' - 'hadoop-mapreduce-examples.jar', - 'args': [ - 'teragen', - '1000', - 'hdfs:///gen/' - ] - }, - 'step_id': 'teragen' - }, - { - 'hadoop_job': { - 'main_jar_file_uri': 'file:///usr/lib/hadoop-mapreduce/' - 'hadoop-mapreduce-examples.jar', - 'args': [ - 'terasort', - 'hdfs:///gen/', - 'hdfs:///sort/' - ] - }, - 'step_id': 'terasort', - 'prerequisite_step_ids': [ - 'teragen' - ] - }], - 'placement': { - 'managed_cluster': { - 'cluster_name': 'my-managed-cluster', - 'config': { - 'gce_cluster_config': { - # Leave 'zone_uri' empty for 'Auto Zone Placement' - # 'zone_uri': '' - 'zone_uri': 'us-central1-a' - } - } - } - } - } - - # Submit the request to instantiate the workflow from an inline template. - operation = workflow_template_client.instantiate_inline_workflow_template( - parent, template - ) - operation.result() - - # Output a success message. - print('Workflow ran successfully.') - # [END dataproc_instantiate_inline_workflow_template] - - -if __name__ == "__main__": - if len(sys.argv) < 3: - sys.exit('python instantiate_inline_workflow_template.py ' - + 'project_id region') - - project_id = sys.argv[1] - region = sys.argv[2] - instantiate_inline_workflow_template(project_id, region) diff --git a/dataproc/instantiate_inline_workflow_template_test.py b/dataproc/instantiate_inline_workflow_template_test.py deleted file mode 100644 index 22673e4ee08..00000000000 --- a/dataproc/instantiate_inline_workflow_template_test.py +++ /dev/null @@ -1,31 +0,0 @@ -# Copyright 2020 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os - -import instantiate_inline_workflow_template - - -PROJECT_ID = os.environ['GOOGLE_CLOUD_PROJECT'] -REGION = 'us-central1' - - -def test_workflows(capsys): - # Wrapper function for client library function - instantiate_inline_workflow_template.instantiate_inline_workflow_template( - PROJECT_ID, REGION - ) - - out, _ = capsys.readouterr() - assert "successfully" in out diff --git a/dataproc/list_clusters.py b/dataproc/list_clusters.py deleted file mode 100644 index 1639c413468..00000000000 --- a/dataproc/list_clusters.py +++ /dev/null @@ -1,63 +0,0 @@ -#!/usr/bin/env python -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Sample command-line program to list Cloud Dataproc clusters in a region. - -Example usage: -python list_clusters.py --project_id=my-project-id --region=global - -""" -import argparse - -from google.cloud import dataproc_v1 -from google.cloud.dataproc_v1.gapic.transports import ( - cluster_controller_grpc_transport) - - -# [START dataproc_list_clusters] -def list_clusters(dataproc, project, region): - """List the details of clusters in the region.""" - for cluster in dataproc.list_clusters(project, region): - print(('{} - {}'.format(cluster.cluster_name, - cluster.status.State.Name( - cluster.status.state)))) -# [END dataproc_list_clusters] - - -def main(project_id, region): - - if region == 'global': - # Use the default gRPC global endpoints. - dataproc_cluster_client = dataproc_v1.ClusterControllerClient() - else: - # Use a regional gRPC endpoint. See: - # https://cloud.google.com/dataproc/docs/concepts/regional-endpoints - client_transport = ( - cluster_controller_grpc_transport.ClusterControllerGrpcTransport( - address='{}-dataproc.googleapis.com:443'.format(region))) - dataproc_cluster_client = dataproc_v1.ClusterControllerClient( - client_transport) - - list_clusters(dataproc_cluster_client, project_id, region) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description=__doc__, formatter_class=( - argparse.RawDescriptionHelpFormatter)) - parser.add_argument( - '--project_id', help='Project ID to access.', required=True) - parser.add_argument( - '--region', help='Region of clusters to list.', required=True) - - args = parser.parse_args() - main(args.project_id, args.region) diff --git a/dataproc/pyspark_sort.py b/dataproc/pyspark_sort.py deleted file mode 100644 index 0ce2350ad02..00000000000 --- a/dataproc/pyspark_sort.py +++ /dev/null @@ -1,28 +0,0 @@ -#!/usr/bin/env python -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" Sample pyspark script to be uploaded to Cloud Storage and run on -Cloud Dataproc. - -Note this file is not intended to be run directly, but run inside a PySpark -environment. -""" - -# [START dataproc_pyspark_sort] -import pyspark - -sc = pyspark.SparkContext() -rdd = sc.parallelize(['Hello,', 'world!', 'dog', 'elephant', 'panther']) -words = sorted(rdd.collect()) -print(words) -# [END dataproc_pyspark_sort] diff --git a/dataproc/pyspark_sort_gcs.py b/dataproc/pyspark_sort_gcs.py deleted file mode 100644 index f1961c378d3..00000000000 --- a/dataproc/pyspark_sort_gcs.py +++ /dev/null @@ -1,30 +0,0 @@ -#!/usr/bin/env python -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" Sample pyspark script to be uploaded to Cloud Storage and run on -Cloud Dataproc. - -Note this file is not intended to be run directly, but run inside a PySpark -environment. - -This file demonstrates how to read from a GCS bucket. See README.md for more -information. -""" - -# [START dataproc_pyspark_sort_gcs] -import pyspark - -sc = pyspark.SparkContext() -rdd = sc.textFile('gs://path-to-your-GCS-file') -print(sorted(rdd.collect())) -# [END dataproc_pyspark_sort_gcs] diff --git a/dataproc/python-api-walkthrough.md b/dataproc/python-api-walkthrough.md deleted file mode 100644 index 1a8d436f720..00000000000 --- a/dataproc/python-api-walkthrough.md +++ /dev/null @@ -1,170 +0,0 @@ -# Use the Python Client Library to call Dataproc APIs - -Estimated completion time: - -## Overview - -This [Cloud Shell](https://cloud.google.com/shell/docs/) walkthrough leads you -through the steps to use the -[Google Cloud Client Libraries for Python](https://googleapis.github.io/google-cloud-python/latest/dataproc/index.html) -to programmatically interact with [Dataproc](https://cloud.google.com/dataproc/docs/). - -As you follow this walkthrough, you run Python code that calls -[Dataproc gRPC APIs](https://cloud.google.com/dataproc/docs/reference/rpc/) -to: - -* create a Dataproc cluster -* submit a small PySpark word sort job to run on the cluster -* get job status -* tear down the cluster after job completion - -## Using the walkthrough - -The `submit_job_to_cluster.py file` used in this walkthrough is opened in the -Cloud Shell editor when you launch the walkthrough. You can view -the code as your follow the walkthrough steps. - -**For more information**: See [Dataproc→Use the Python Client Library](https://cloud.google.com/dataproc/docs/tutorials/python-library-example) for -an explanation of how the code works. - -**To reload this walkthrough:** Run the following command from the -`~/python-docs-samples/dataproc` directory in Cloud Shell: - - cloudshell launch-tutorial python-api-walkthrough.md - -**To copy and run commands**: Click the "Paste in Cloud Shell" button - () - on the side of a code box, then press `Enter` to run the command. - -## Prerequisites (1) - -1. Create or select a Google Cloud Platform project to use for this tutorial. - * - -1. Click the link below to enable the Dataproc, Compute Engine, and Cloud Storage APIs - in a separate GCP console tab in your browser. - - **Note:** After you select your project and enable the APIs, return to this tutorial by clicking - on the **Cloud Shell** tab in your browser. - - * [Enable APIs](https://console.cloud.google.com/flows/enableapi?apiid=dataproc,compute_component,storage-component.googleapis.com&redirect=https://console.cloud.google.com) - -## Prerequisites (2) - -1. This walkthrough uploads a PySpark file (`pyspark_sort.py`) to a - [Cloud Storage bucket](https://cloud.google.com/storage/docs/key-terms#buckets) in - your project. - * You can use the [Cloud Storage browser page](https://console.cloud.google.com/storage/browser) - in Google Cloud Platform Console to view existing buckets in your project. - -     **OR** - - * To create a new bucket, run the following command. Your bucket name must be unique. - ```bash - gsutil mb -p {{project-id}} gs://your-bucket-name - ``` - -1. Set environment variables. - - * Set the name of your bucket. - ```bash - BUCKET=your-bucket-name - ``` - -## Prerequisites (3) - -1. Set up a Python - [virtual environment](https://virtualenv.readthedocs.org/en/latest/) - in Cloud Shell. - - * Create the virtual environment. - ```bash - virtualenv ENV - ``` - * Activate the virtual environment. - ```bash - source ENV/bin/activate - ``` - -1. Install library dependencies in Cloud Shell. - ```bash - pip install -r requirements.txt - ``` - -## Create a cluster and submit a job - -1. Set a name for your new cluster. - ```bash - CLUSTER=new-cluster-name - ``` - -1. Set a [zone](https://cloud.google.com/compute/docs/regions-zones/#available) - where your new cluster will be located. You can change the - "us-central1-a" zone that is pre-set in the following command. - ```bash - ZONE=us-central1-a - ``` - -1. Run `submit_job.py` with the `--create_new_cluster` flag - to create a new cluster and submit the `pyspark_sort.py` job - to the cluster. - - ```bash - python submit_job_to_cluster.py \ - --project_id={{project-id}} \ - --cluster_name=$CLUSTER \ - --zone=$ZONE \ - --gcs_bucket=$BUCKET \ - --create_new_cluster - ``` - -## Job Output - -Job output in Cloud Shell shows cluster creation, job submission, - job completion, and then tear-down of the cluster. - - ... - Creating cluster... - Cluster created. - Uploading pyspark file to Cloud Storage. - new-cluster-name - RUNNING - Submitted job ID ... - Waiting for job to finish... - Job finished. - Downloading output file - ..... - ['Hello,', 'dog', 'elephant', 'panther', 'world!'] - ... - Tearing down cluster - ``` -## Congratulations on Completing the Walkthrough! - - ---- - -### Next Steps: - -* **View job details from the Console.** View job details by selecting the - PySpark job from the Dataproc -= - [Jobs page](https://console.cloud.google.com/dataproc/jobs) - in the Google Cloud Platform Console. - -* **Delete resources used in the walkthrough.** - The `submit_job_to_cluster.py` job deletes the cluster that it created for this - walkthrough. - - If you created a bucket to use for this walkthrough, - you can run the following command to delete the - Cloud Storage bucket (the bucket must be empty). - ```bash - gsutil rb gs://$BUCKET - ``` - You can run the following command to delete the bucket **and all - objects within it. Note: the deleted objects cannot be recovered.** - ```bash - gsutil rm -r gs://$BUCKET - ``` - -* **For more information.** See the [Dataproc documentation](https://cloud.google.com/dataproc/docs/) - for API reference and product feature information. diff --git a/dataproc/quickstart/quickstart.py b/dataproc/quickstart/quickstart.py deleted file mode 100644 index 4159e281520..00000000000 --- a/dataproc/quickstart/quickstart.py +++ /dev/null @@ -1,159 +0,0 @@ -#!/usr/bin/env python - -# Copyright 2019 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# [START dataproc_quickstart] -""" -This quickstart sample walks a user through creating a Cloud Dataproc -cluster, submitting a PySpark job from Google Cloud Storage to the -cluster, reading the output of the job and deleting the cluster, all -using the Python client library. - -Usage: - python quickstart.py --project_id --region \ - --cluster_name --job_file_path -""" - -import argparse -import time - -from google.cloud import dataproc_v1 as dataproc -from google.cloud import storage - - -def quickstart(project_id, region, cluster_name, job_file_path): - # Create the cluster client. - cluster_client = dataproc.ClusterControllerClient(client_options={ - 'api_endpoint': '{}-dataproc.googleapis.com:443'.format(region) - }) - - # Create the cluster config. - cluster = { - 'project_id': project_id, - 'cluster_name': cluster_name, - 'config': { - 'master_config': { - 'num_instances': 1, - 'machine_type_uri': 'n1-standard-1' - }, - 'worker_config': { - 'num_instances': 2, - 'machine_type_uri': 'n1-standard-1' - } - } - } - - # Create the cluster. - operation = cluster_client.create_cluster(project_id, region, cluster) - result = operation.result() - - print('Cluster created successfully: {}'.format(result.cluster_name)) - - # Create the job client. - job_client = dataproc.JobControllerClient(client_options={ - 'api_endpoint': '{}-dataproc.googleapis.com:443'.format(region) - }) - - # Create the job config. - job = { - 'placement': { - 'cluster_name': cluster_name - }, - 'pyspark_job': { - 'main_python_file_uri': job_file_path - } - } - - job_response = job_client.submit_job(project_id, region, job) - job_id = job_response.reference.job_id - - print('Submitted job \"{}\".'.format(job_id)) - - # Termimal states for a job. - terminal_states = { - dataproc.types.JobStatus.ERROR, - dataproc.types.JobStatus.CANCELLED, - dataproc.types.JobStatus.DONE - } - - # Create a timeout such that the job gets cancelled if not in a - # terminal state after a fixed period of time. - timeout_seconds = 600 - time_start = time.time() - - # Wait for the job to complete. - while job_response.status.state not in terminal_states: - if time.time() > time_start + timeout_seconds: - job_client.cancel_job(project_id, region, job_id) - print('Job {} timed out after threshold of {} seconds.'.format( - job_id, timeout_seconds)) - - # Poll for job termination once a second. - time.sleep(1) - job_response = job_client.get_job(project_id, region, job_id) - - # Cloud Dataproc job output gets saved to a GCS bucket allocated to it. - cluster_info = cluster_client.get_cluster( - project_id, region, cluster_name) - - storage_client = storage.Client() - bucket = storage_client.get_bucket(cluster_info.config.config_bucket) - output_blob = ( - 'google-cloud-dataproc-metainfo/{}/jobs/{}/driveroutput.000000000' - .format(cluster_info.cluster_uuid, job_id)) - output = bucket.blob(output_blob).download_as_string() - - print('Job {} finished with state {}:\n{}'.format( - job_id, - job_response.status.State.Name(job_response.status.state), - output)) - - # Delete the cluster once the job has terminated. - operation = cluster_client.delete_cluster(project_id, region, cluster_name) - operation.result() - - print('Cluster {} successfully deleted.'.format(cluster_name)) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description=__doc__, - formatter_class=argparse.RawDescriptionHelpFormatter, - ) - parser.add_argument( - '--project_id', - type=str, - required=True, - help='Project to use for creating resources.') - parser.add_argument( - '--region', - type=str, - required=True, - help='Region where the resources should live.') - parser.add_argument( - '--cluster_name', - type=str, - required=True, - help='Name to use for creating a cluster.') - parser.add_argument( - '--job_file_path', - type=str, - required=True, - help='Job in GCS to execute against the cluster.') - - args = parser.parse_args() - quickstart(args.project_id, args.region, - args.cluster_name, args.job_file_path) -# [END dataproc_quickstart] diff --git a/dataproc/quickstart/quickstart_test.py b/dataproc/quickstart/quickstart_test.py deleted file mode 100644 index 3e17f6fa3e5..00000000000 --- a/dataproc/quickstart/quickstart_test.py +++ /dev/null @@ -1,71 +0,0 @@ -# Copyright 2019 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import uuid - -from google.cloud import dataproc_v1 as dataproc -from google.cloud import storage -import pytest - -import quickstart - - -PROJECT_ID = os.environ['GOOGLE_CLOUD_PROJECT'] -REGION = 'us-central1' -CLUSTER_NAME = 'py-qs-test-{}'.format(str(uuid.uuid4())) -STAGING_BUCKET = 'py-dataproc-qs-bucket-{}'.format(str(uuid.uuid4())) -JOB_FILE_NAME = 'sum.py' -JOB_FILE_PATH = 'gs://{}/{}'.format(STAGING_BUCKET, JOB_FILE_NAME) -SORT_CODE = ( - "import pyspark\n" - "sc = pyspark.SparkContext()\n" - "rdd = sc.parallelize((1,2,3,4,5))\n" - "sum = rdd.reduce(lambda x, y: x + y)\n" -) - - -@pytest.fixture(autouse=True) -def setup_teardown(): - storage_client = storage.Client() - bucket = storage_client.create_bucket(STAGING_BUCKET) - blob = bucket.blob(JOB_FILE_NAME) - blob.upload_from_string(SORT_CODE) - - yield - - cluster_client = dataproc.ClusterControllerClient(client_options={ - 'api_endpoint': '{}-dataproc.googleapis.com:443'.format(REGION) - }) - - # The quickstart sample deletes the cluster, but if the test fails - # before cluster deletion occurs, it can be manually deleted here. - clusters = cluster_client.list_clusters(PROJECT_ID, REGION) - - for cluster in clusters: - if cluster.cluster_name == CLUSTER_NAME: - cluster_client.delete_cluster(PROJECT_ID, REGION, CLUSTER_NAME) - - blob.delete() - bucket.delete() - - -def test_quickstart(capsys): - quickstart.quickstart(PROJECT_ID, REGION, CLUSTER_NAME, JOB_FILE_PATH) - out, _ = capsys.readouterr() - - assert 'Cluster created successfully' in out - assert 'Submitted job' in out - assert 'finished with state DONE:' in out - assert 'successfully deleted' in out diff --git a/dataproc/requirements-test.txt b/dataproc/requirements-test.txt deleted file mode 100644 index 7e460c8c866..00000000000 --- a/dataproc/requirements-test.txt +++ /dev/null @@ -1 +0,0 @@ -pytest==6.0.1 diff --git a/dataproc/requirements.txt b/dataproc/requirements.txt deleted file mode 100644 index 5dd9b1d8b1c..00000000000 --- a/dataproc/requirements.txt +++ /dev/null @@ -1,6 +0,0 @@ -grpcio==1.31.0 -google-auth==1.20.1 -google-auth-httplib2==0.0.4 -google-cloud==0.34.0 -google-cloud-storage==1.30.0 -google-cloud-dataproc==1.1.1 diff --git a/dataproc/single_job_workflow.py b/dataproc/single_job_workflow.py deleted file mode 100644 index b2754b06c1e..00000000000 --- a/dataproc/single_job_workflow.py +++ /dev/null @@ -1,209 +0,0 @@ -#!/usr/bin/env python -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -r"""Sample Cloud Dataproc inline workflow to run a pyspark job on an ephermeral -cluster. -Example Usage to run the inline workflow on a managed cluster: -python single_job_workflow.py --project_id=$PROJECT --gcs_bucket=$BUCKET \ - --cluster_name=$CLUSTER --zone=$ZONE -Example Usage to run the inline workflow on a global region managed cluster: -python submit_job_to_cluster.py --project_id=$PROJECT --gcs_bucket=$BUCKET \ - --cluster_name=$CLUSTER --zone=$ZONE --global_region -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import argparse -import os - -from google.cloud import dataproc_v1 -from google.cloud import storage -from google.cloud.dataproc_v1.gapic.transports import ( - workflow_template_service_grpc_transport) - - -DEFAULT_FILENAME = "pyspark_sort.py" -waiting_callback = False - - -def get_pyspark_file(pyspark_file=None): - if pyspark_file: - f = open(pyspark_file, "rb") - return f, os.path.basename(pyspark_file) - else: - """Gets the PySpark file from current directory.""" - current_dir = os.path.dirname(os.path.abspath(__file__)) - f = open(os.path.join(current_dir, DEFAULT_FILENAME), "rb") - return f, DEFAULT_FILENAME - - -def get_region_from_zone(zone): - try: - region_as_list = zone.split("-")[:-1] - return "-".join(region_as_list) - except (AttributeError, IndexError, ValueError): - raise ValueError("Invalid zone provided, please check your input.") - - -def upload_pyspark_file(project, bucket_name, filename, spark_file): - """Uploads the PySpark file in this directory to the configured input - bucket.""" - print("Uploading pyspark file to Cloud Storage.") - client = storage.Client(project=project) - bucket = client.get_bucket(bucket_name) - blob = bucket.blob(filename) - blob.upload_from_file(spark_file) - - -def run_workflow(dataproc, project, region, zone, bucket_name, filename, - cluster_name): - - parent = "projects/{}/regions/{}".format(project, region) - zone_uri = ("https://www.googleapis.com/compute/v1/projects/{}/zones/{}" - .format(project, zone)) - - workflow_data = { - "placement": { - "managed_cluster": { - "cluster_name": cluster_name, - "config": { - "gce_cluster_config": {"zone_uri": zone_uri}, - "master_config": { - "num_instances": 1, - "machine_type_uri": "n1-standard-1", - }, - "worker_config": { - "num_instances": 2, - "machine_type_uri": "n1-standard-1", - }, - }, - } - }, - "jobs": [ - { - "pyspark_job": { - "main_python_file_uri": "gs://{}/{}".format( - bucket_name, filename) - }, - "step_id": "pyspark-job", - } - ], - } - - workflow = dataproc.instantiate_inline_workflow_template(parent, - workflow_data) - - workflow.add_done_callback(callback) - global waiting_callback - waiting_callback = True - - -def callback(operation_future): - # Reset global when callback returns. - global waiting_callback - waiting_callback = False - - -def wait_for_workflow_end(): - """Wait for cluster creation.""" - print("Waiting for workflow completion ...") - print("Workflow and job progress, and job driver output available from: " - "https://console.cloud.google.com/dataproc/workflows/") - - while True: - if not waiting_callback: - print("Workflow completed.") - break - - -def main( - project_id, - zone, - cluster_name, - bucket_name, - pyspark_file=None, - create_new_cluster=True, - global_region=True, -): - - # [START dataproc_get_workflow_template_client] - if global_region: - region = "global" - # Use the default gRPC global endpoints. - dataproc_workflow_client = dataproc_v1.WorkflowTemplateServiceClient() - else: - region = get_region_from_zone(zone) - # Use a regional gRPC endpoint. See: - # https://cloud.google.com/dataproc/docs/concepts/regional-endpoints - client_transport = (workflow_template_service_grpc_transport - .WorkflowTemplateServiceGrpcTransport( - address="{}-dataproc.googleapis.com:443" - .format(region))) - dataproc_workflow_client = dataproc_v1.WorkflowTemplateServiceClient( - client_transport - ) - # [END dataproc_get_workflow_template_client] - - try: - spark_file, spark_filename = get_pyspark_file(pyspark_file) - upload_pyspark_file(project_id, bucket_name, spark_filename, - spark_file) - - run_workflow( - dataproc_workflow_client, - project_id, - region, - zone, - bucket_name, - spark_filename, - cluster_name - ) - wait_for_workflow_end() - - finally: - spark_file.close() - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description=__doc__, formatter_class=(argparse - .RawDescriptionHelpFormatter)) - parser.add_argument( - "--project_id", help="Project ID you want to access.", required=True - ) - parser.add_argument( - "--zone", help="Zone to create clusters in/connect to", required=True - ) - parser.add_argument( - "--cluster_name", help="Name of the cluster to create/connect to", - required=True - ) - parser.add_argument( - "--gcs_bucket", help="Bucket to upload Pyspark file to", required=True - ) - parser.add_argument( - "--pyspark_file", help="Pyspark filename. Defaults to pyspark_sort.py" - ) - parser.add_argument("--global_region", - action="store_true", - help="If cluster is in the global region") - - args = parser.parse_args() - main( - args.project_id, - args.zone, - args.cluster_name, - args.gcs_bucket, - args.pyspark_file, - ) diff --git a/dataproc/submit_job_to_cluster.py b/dataproc/submit_job_to_cluster.py deleted file mode 100644 index 389cbec87aa..00000000000 --- a/dataproc/submit_job_to_cluster.py +++ /dev/null @@ -1,288 +0,0 @@ -#!/usr/bin/env python -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -r"""Sample command-line program to run a pyspark job on a new or existing -cluster. - -Global region clusters are supported with --global_region flag. - -Example Usage to run the pyspark job on a new cluster: -python submit_job_to_cluster.py --project_id=$PROJECT --gcs_bucket=$BUCKET \ - --create_new_cluster --cluster_name=$CLUSTER --zone=$ZONE - -Example Usage to run the pyspark job on an existing global region cluster: -python submit_job_to_cluster.py --project_id=$PROJECT --gcs_bucket=$BUCKET \ - --global_region --cluster_name=$CLUSTER --zone=$ZONE - -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import argparse -import os - -from google.cloud import dataproc_v1 -from google.cloud import storage -from google.cloud.dataproc_v1.gapic.transports import ( - cluster_controller_grpc_transport) -from google.cloud.dataproc_v1.gapic.transports import ( - job_controller_grpc_transport) - - -DEFAULT_FILENAME = 'pyspark_sort.py' -waiting_callback = False - - -def get_pyspark_file(pyspark_file=None): - if pyspark_file: - f = open(pyspark_file, "rb") - return f, os.path.basename(pyspark_file) - else: - """Gets the PySpark file from current directory.""" - current_dir = os.path.dirname(os.path.abspath(__file__)) - f = open(os.path.join(current_dir, DEFAULT_FILENAME), "rb") - return f, DEFAULT_FILENAME - - -def get_region_from_zone(zone): - try: - region_as_list = zone.split('-')[:-1] - return '-'.join(region_as_list) - except (AttributeError, IndexError, ValueError): - raise ValueError('Invalid zone provided, please check your input.') - - -def upload_pyspark_file(project, bucket_name, filename, spark_file): - """Uploads the PySpark file in this directory to the configured input - bucket.""" - print('Uploading pyspark file to Cloud Storage.') - client = storage.Client(project=project) - bucket = client.get_bucket(bucket_name) - blob = bucket.blob(filename) - blob.upload_from_file(spark_file) - - -def download_output(project, cluster_id, output_bucket, job_id): - """Downloads the output file from Cloud Storage and returns it as a - string.""" - print('Downloading output file.') - client = storage.Client(project=project) - bucket = client.get_bucket(output_bucket) - output_blob = ( - ('google-cloud-dataproc-metainfo/{}/jobs/{}/driveroutput.000000000'. - format(cluster_id, job_id))) - return bucket.blob(output_blob).download_as_string() - - -# [START dataproc_create_cluster] -def create_cluster(dataproc, project, zone, region, cluster_name): - """Create the cluster.""" - print('Creating cluster...') - zone_uri = \ - 'https://www.googleapis.com/compute/v1/projects/{}/zones/{}'.format( - project, zone) - cluster_data = { - 'project_id': project, - 'cluster_name': cluster_name, - 'config': { - 'gce_cluster_config': { - 'zone_uri': zone_uri - }, - 'master_config': { - 'num_instances': 1, - 'machine_type_uri': 'n1-standard-1' - }, - 'worker_config': { - 'num_instances': 2, - 'machine_type_uri': 'n1-standard-1' - } - } - } - - cluster = dataproc.create_cluster(project, region, cluster_data) - cluster.add_done_callback(callback) - global waiting_callback - waiting_callback = True -# [END dataproc_create_cluster] - - -def callback(operation_future): - # Reset global when callback returns. - global waiting_callback - waiting_callback = False - - -def wait_for_cluster_creation(): - """Wait for cluster creation.""" - print('Waiting for cluster creation...') - - while True: - if not waiting_callback: - print("Cluster created.") - break - - -# [START dataproc_list_clusters_with_detail] -def list_clusters_with_details(dataproc, project, region): - """List the details of clusters in the region.""" - for cluster in dataproc.list_clusters(project, region): - print(('{} - {}'.format(cluster.cluster_name, - cluster.status.State.Name( - cluster.status.state)))) -# [END dataproc_list_clusters_with_detail] - - -def get_cluster_id_by_name(dataproc, project_id, region, cluster_name): - """Helper function to retrieve the ID and output bucket of a cluster by - name.""" - for cluster in dataproc.list_clusters(project_id, region): - if cluster.cluster_name == cluster_name: - return cluster.cluster_uuid, cluster.config.config_bucket - - -# [START dataproc_submit_pyspark_job] -def submit_pyspark_job(dataproc, project, region, cluster_name, bucket_name, - filename): - """Submit the Pyspark job to the cluster (assumes `filename` was uploaded - to `bucket_name.""" - job_details = { - 'placement': { - 'cluster_name': cluster_name - }, - 'pyspark_job': { - 'main_python_file_uri': 'gs://{}/{}'.format(bucket_name, filename) - } - } - - result = dataproc.submit_job( - project_id=project, region=region, job=job_details) - job_id = result.reference.job_id - print('Submitted job ID {}.'.format(job_id)) - return job_id -# [END dataproc_submit_pyspark_job] - - -# [START dataproc_delete] -def delete_cluster(dataproc, project, region, cluster): - """Delete the cluster.""" - print('Tearing down cluster.') - result = dataproc.delete_cluster( - project_id=project, region=region, cluster_name=cluster) - return result -# [END dataproc_delete] - - -# [START dataproc_wait] -def wait_for_job(dataproc, project, region, job_id): - """Wait for job to complete or error out.""" - print('Waiting for job to finish...') - while True: - job = dataproc.get_job(project, region, job_id) - # Handle exceptions - if job.status.State.Name(job.status.state) == 'ERROR': - raise Exception(job.status.details) - elif job.status.State.Name(job.status.state) == 'DONE': - print('Job finished.') - return job -# [END dataproc_wait] - - -def main(project_id, - zone, - cluster_name, - bucket_name, - pyspark_file=None, - create_new_cluster=True, - global_region=True): - - # [START dataproc_get_client] - if global_region: - region = 'global' - # Use the default gRPC global endpoints. - dataproc_cluster_client = dataproc_v1.ClusterControllerClient() - dataproc_job_client = dataproc_v1.JobControllerClient() - else: - region = get_region_from_zone(zone) - # Use a regional gRPC endpoint. See: - # https://cloud.google.com/dataproc/docs/concepts/regional-endpoints - client_transport = ( - cluster_controller_grpc_transport.ClusterControllerGrpcTransport( - address='{}-dataproc.googleapis.com:443'.format(region))) - job_transport = ( - job_controller_grpc_transport.JobControllerGrpcTransport( - address='{}-dataproc.googleapis.com:443'.format(region))) - dataproc_cluster_client = dataproc_v1.ClusterControllerClient( - client_transport) - dataproc_job_client = dataproc_v1.JobControllerClient(job_transport) - # [END dataproc_get_client] - - try: - spark_file, spark_filename = get_pyspark_file(pyspark_file) - if create_new_cluster: - create_cluster(dataproc_cluster_client, project_id, zone, region, - cluster_name) - wait_for_cluster_creation() - upload_pyspark_file(project_id, bucket_name, spark_filename, - spark_file) - - list_clusters_with_details(dataproc_cluster_client, project_id, - region) - - (cluster_id, output_bucket) = ( - get_cluster_id_by_name(dataproc_cluster_client, project_id, - region, cluster_name)) - - # [START dataproc_call_submit_pyspark_job] - job_id = submit_pyspark_job(dataproc_job_client, project_id, region, - cluster_name, bucket_name, spark_filename) - # [END dataproc_call_submit_pyspark_job] - - wait_for_job(dataproc_job_client, project_id, region, job_id) - output = download_output(project_id, cluster_id, output_bucket, job_id) - print('Received job output {}'.format(output)) - return output - finally: - if create_new_cluster: - delete_cluster(dataproc_cluster_client, project_id, region, - cluster_name) - spark_file.close() - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description=__doc__, - formatter_class=argparse. - RawDescriptionHelpFormatter) - parser.add_argument( - '--project_id', help='Project ID you want to access.', required=True) - parser.add_argument('--zone', - help='Zone to create clusters in/connect to', - required=True) - parser.add_argument('--cluster_name', - help='Name of the cluster to create/connect to', - required=True) - parser.add_argument('--gcs_bucket', - help='Bucket to upload Pyspark file to', - required=True) - parser.add_argument('--pyspark_file', - help='Pyspark filename. Defaults to pyspark_sort.py') - parser.add_argument('--create_new_cluster', - action='store_true', - help='States if the cluster should be created') - parser.add_argument('--global_region', - action='store_true', - help='If cluster is in the global region') - - args = parser.parse_args() - main(args.project_id, args.zone, args.cluster_name, args.gcs_bucket, - args.pyspark_file, args.create_new_cluster, args.global_region)