111 changes: 110 additions & 1 deletion docs/source/infrastructures.rst
@@ -140,4 +140,113 @@ PubSub Topics
    config = { ... }
    app.pubsub_topic("topic", config=config)

To further configure your PubSub topic within Goblet, provide the ``config`` parameter based on the `Topic Resource <https://cloud.google.com/pubsub/docs/reference/rest/v1/projects.topic>`_ documentation.
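For example, a ``config`` that sets labels and a retention period might look like the following sketch (``labels`` and ``messageRetentionDuration`` are fields from the linked Topic resource; the values are illustrative):

.. code:: python

    config = {
        "labels": {"env": "dev"},  # illustrative label
        "messageRetentionDuration": "86400s",  # retain messages for one day
    }
    app.pubsub_topic("topic", config=config)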

BigQuery Spark Stored Procedures
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

To deploy BigQuery stored procedures that run Spark, follow the examples below.
The BigQuery documentation for Spark stored procedures can be found `here <https://cloud.google.com/bigquery/docs/spark-procedures>`_.

Using a function defined in the same Python file:

.. code:: python

    import logging
    from goblet import Goblet, goblet_entrypoint

    app = Goblet(function_name="create-bq-spark-stored-procedure")

    app.log.setLevel(logging.DEBUG)  # configure goblet logger level
    goblet_entrypoint(app)

    # The Spark code that the stored procedure will run
    def spark_handler():
        from pyspark.sql import SparkSession
        import pyspark.sql.functions as F

        spark = SparkSession.builder.appName("spark-bigquery-demo").getOrCreate()

        # Load data from BigQuery.
        texts = spark.read.format("bigquery") \
            .option("table", "tutorial.poc") \
            .load()
        texts.createOrReplaceTempView("words")

        # Compute the length of each text field.
        text_count = texts.select("id", "text", F.length("text").alias("sum_text_count"))
        text_count.show()
        text_count.printSchema()

        # Save the result back to BigQuery.
        text_count.write.mode("append").format("bigquery") \
            .option("writeMethod", "direct") \
            .save("tutorial.wordcount_output")

    # Create a BigQuery Spark stored procedure from the in-file function
    app.bqsparkstoredprocedure(
        name="count_words_procedure_external",
        dataset_id="tutorial",
        func=spark_handler,
    )

Using a function from a separate Python file and loading additional Python files:

`spark.py`:

.. code:: python

    def spark_handler():
        from pyspark.sql import SparkSession
        import pyspark.sql.functions as F

        spark = SparkSession.builder.appName("spark-bigquery-demo").getOrCreate()

        # Load data from BigQuery.
        texts = spark.read.format("bigquery") \
            .option("table", "tutorial.poc") \
            .load()
        texts.createOrReplaceTempView("words")

        # Compute the length of each text field.
        text_count = texts.select("id", "text", F.length("text").alias("sum_text_count"))
        text_count.show()
        text_count.printSchema()

        # Save the result back to BigQuery.
        text_count.write.mode("append").format("bigquery") \
            .option("writeMethod", "direct") \
            .save("tutorial.wordcount_output")

    if __name__ == "__main__":
        spark_handler()

`additional.py`:

.. code:: python

    import logging

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

    def additional_func():
        logger.info("additional_func")

`main.py`:

.. code:: python

    import logging
    from goblet import Goblet, goblet_entrypoint

    app = Goblet(function_name="create-bq-spark-stored-procedure")

    app.log.setLevel(logging.DEBUG)  # configure goblet logger level
    goblet_entrypoint(app)

    # Create a BigQuery Spark stored procedure from spark.py plus additional Python files
    app.bqsparkstoredprocedure(
        name="count_words_procedure_external",
        dataset_id="tutorial",
        spark_file="spark.py",
        additional_python_files=["additional.py"],
    )

Options that can be passed to the ``bqsparkstoredprocedure`` method are:

- name: name of the resource
- dataset_id: dataset id where the routine will be created
- func (optional): function/method to be executed
- runtime_version (optional): runtime version of the Spark code
- container_image (optional): container image to use
- spark_file (optional): local file containing the Spark code
- additional_python_files (optional): list of local files with additional Python code (e.g. libraries)
- additional_files (optional): list of local files with additional data (e.g. CSVs)
- properties (optional): dictionary of additional Spark properties. `Supported properties <https://spark.apache.org/docs/latest/configuration.html#spark-properties>`_
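Several of these options can be combined in one call; a sketch, where ``lookup.csv`` and the property value are illustrative assumptions rather than part of the example above:

.. code:: python

    app.bqsparkstoredprocedure(
        name="count_words_procedure_tuned",  # hypothetical routine name
        dataset_id="tutorial",
        spark_file="spark.py",
        additional_python_files=["additional.py"],
        additional_files=["lookup.csv"],  # hypothetical data file shipped with the job
        properties={"spark.executor.memory": "4g"},  # standard Spark property
    )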
10 changes: 10 additions & 0 deletions examples/example_bq_spark_stored_procedure/README.md
@@ -0,0 +1,10 @@
# BigQuery Spark Stored Procedure Example
This example demonstrates how to use Goblet to create a BigQuery stored procedure that runs Spark. See https://cloud.google.com/bigquery/docs/spark-procedures

## Running Example
```bash
# Deploy the example. This creates the Spark stored procedure in BigQuery.
goblet deploy --skip-backend
bq query --use_legacy_sql=false --destination_table=myDataset.myTable \
'CALL `project.myDataset.count_words_procedure_external`();'
```
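
The deployed procedure can also be called from Python; a minimal sketch using the `google-cloud-bigquery` client (assumed installed; `project.myDataset` is a placeholder, as in the `bq` call above):

```python
# Sketch: invoke the deployed procedure with the BigQuery Python client.
from google.cloud import bigquery

client = bigquery.Client()
job = client.query("CALL `project.myDataset.count_words_procedure_external`();")
job.result()  # block until the procedure finishes
```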
6 changes: 6 additions & 0 deletions examples/example_bq_spark_stored_procedure/additional.py
@@ -0,0 +1,6 @@
import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def additional_func():
logger.info("additional_func")
23 changes: 23 additions & 0 deletions examples/example_bq_spark_stored_procedure/main.py
@@ -0,0 +1,23 @@
import logging
from goblet import Goblet, goblet_entrypoint
# from spark import spark_handler

app = Goblet(function_name="create-bq-spark-stored-procedure")

app.log.setLevel(logging.DEBUG) # configure goblet logger level
goblet_entrypoint(app)

# Create a bq spark stored procedure with the spark code and additional python files
app.bqsparkstoredprocedure(
    name="count_words_procedure_external",
    dataset_id="tutorial",
    spark_file="spark.py",
    additional_python_files=["additional.py"],
)

# Create a bq spark stored procedure with the spark code from the function
# app.bqsparkstoredprocedure(
#     name="count_words_procedure",
#     dataset_id="tutorial",
#     func=spark_handler,
# )
1 change: 1 addition & 0 deletions examples/example_bq_spark_stored_procedure/requirements.txt
@@ -0,0 +1 @@
pyspark==3.5.0
23 changes: 23 additions & 0 deletions examples/example_bq_spark_stored_procedure/spark.py
@@ -0,0 +1,23 @@
def spark_handler():
    from pyspark.sql import SparkSession
    import pyspark.sql.functions as F

    spark = SparkSession.builder.appName("spark-bigquery-demo").getOrCreate()

    # Load data from BigQuery.
    texts = spark.read.format("bigquery") \
        .option("table", "tutorial.poc") \
        .load()
    texts.createOrReplaceTempView("words")

    # Compute the length of each text field.
    text_count = texts.select("id", "text", F.length("text").alias("sum_text_count"))
    text_count.show()
    text_count.printSchema()

    # Save the result back to BigQuery.
    text_count.write.mode("append").format("bigquery") \
        .option("writeMethod", "direct") \
        .save("tutorial.wordcount_output")

if __name__ == "__main__":
    spark_handler()
1 change: 1 addition & 0 deletions goblet/cli.py
@@ -34,6 +34,7 @@
    "pubsub",
    "redis",
    "vpcconnector",
    "bqsparkstoredprocedure",
]


17 changes: 17 additions & 0 deletions goblet/client.py
@@ -24,6 +24,7 @@
    "iam": "v1",
    "cloudresourcemanager": "v3",
    "artifactregistry": "v1",
    "storage": "v1",
}


@@ -284,3 +285,19 @@ def artifactregistry_repositories(self):
            calls="projects.locations.repositories",
            parent_schema="projects/{project_id}/locations/{location_id}",
        )

    @property
    def storage_buckets(self):
        return Client(
            "storage",
            self.client_versions.get("storage", "v1"),
            calls="buckets",
        )

    @property
    def storage_objects(self):
        return Client(
            "storage",
            self.client_versions.get("storage", "v1"),
            calls="objects",
        )
47 changes: 47 additions & 0 deletions goblet/decorators.py
@@ -15,6 +15,9 @@
from goblet.infrastructures.cloudtask import CloudTaskQueue
from goblet.infrastructures.pubsub import PubSubTopic
from goblet.infrastructures.alerts import PubSubDLQCondition
from goblet.infrastructures.bq_spark_stored_procedure import (
    BigQuerySparkStoredProcedure,
)

log = logging.getLogger(__name__)
log.setLevel(logging.getLevelName(os.getenv("GOBLET_LOG_LEVEL", "INFO")))
@@ -43,6 +46,7 @@
    "vpcconnector": VPCConnector,
    "cloudtaskqueue": CloudTaskQueue,
    "pubsub_topic": PubSubTopic,
    "bqsparkstoredprocedure": BigQuerySparkStoredProcedure,
}


@@ -346,6 +350,49 @@ def vpcconnector(self, name, **kwargs):
            kwargs={"name": name, "kwargs": kwargs},
        )

    def bqsparkstoredprocedure(
        self,
        name,
        dataset_id,
        runtime_version="1.1",
        container_image=None,
        func=None,
        spark_file=None,
        additional_python_files=None,
        additional_files=None,
        properties=None,
        **kwargs,
    ):
        """
        BigQuery Spark Stored Procedure trigger
        :param name: name of resource
        :param dataset_id: dataset id where the routine will be created
        :param func (optional): function/method
        :param runtime_version (optional): runtime version of the spark code
        :param container_image (optional): container image to use
        :param spark_file (optional): file from a local path with the spark code
        :param additional_python_files (optional): List of files from a local path with additional code (Ex: libraries)
        :param additional_files (optional): List of files from a local path with additional files (Ex: csvs)
        :param properties (optional): Dictionary with additional properties. Supported properties: https://spark.apache.org/docs/latest/configuration.html#spark-properties
        """
        return self._register_infrastructure(
            handler_type="bqsparkstoredprocedure",
            kwargs={
                "name": name,
                "kwargs": {
                    "dataset_id": dataset_id,
                    "func": func,
                    "runtime_version": runtime_version,
                    "container_image": container_image,
                    "spark_file": spark_file,
                    "additional_python_files": additional_python_files,
                    "additional_files": additional_files,
                    "properties": properties,
                    **kwargs,
                },
            },
        )

    def errorhandler(self, error):
        def _register_error_handler(error_handler):
            self.error_handlers[error] = error_handler