From 7e0233d1a4dbb93ea5ed6dc5aba8c5c0d4419f77 Mon Sep 17 00:00:00 2001 From: Braun Date: Wed, 30 Aug 2023 16:14:10 -0500 Subject: [PATCH 1/7] docs(docs): adding github actions for deploying docs --- .github/workflows/deploy_docs.yml | 57 +++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) create mode 100644 .github/workflows/deploy_docs.yml diff --git a/.github/workflows/deploy_docs.yml b/.github/workflows/deploy_docs.yml new file mode 100644 index 00000000..d8352072 --- /dev/null +++ b/.github/workflows/deploy_docs.yml @@ -0,0 +1,57 @@ +# Simple workflow for deploying static content to GitHub Pages +name: Deploy static content to Pages + +on: + # Runs on pushes targeting the default branch + push: + branches: ["feature/docs"] + + # Allows you to run this workflow manually from the Actions tab + workflow_dispatch: + +# Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages +permissions: + contents: read + pages: write + id-token: write + +# Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued. +# However, do NOT cancel in-progress runs as we want to allow these production deployments to complete. 
+concurrency: + group: "pages" + cancel-in-progress: false + +jobs: + # Single deploy job since we're just deploying + deploy: + environment: + name: github-pages + url: ${{ steps.deployment.outputs.page_url }} + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v3 + - name: Install Python + uses: "actions/setup-python@v4" + with: + python-version: "3.10" + - name: Install Poetry + run: "curl -sSL https://install.python-poetry.org | python3 -" + - name: Update Path + run : echo "${HOME}/.local/bin" >> $GITHUB_PATH + - name: Set .venv Config + run : poetry config virtualenvs.in-project true + - name: Install Deps + run: "../common-utils/common_utils_build.sh && poetry install --with-only main,docs --no-root --no-ansi" + - name: Build Docs + run: "poetry run mkdocs build" + - name: Setup Pages + uses: actions/configure-pages@v3 + - name: Upload artifact + uses: actions/upload-pages-artifact@v2 + with: + # Upload entire repository + path: 'docs/site' + - name: Deploy to GitHub Pages + id: deployment + uses: actions/deploy-pages@v2 \ No newline at end of file From 4e55c2a40bc98b6994ebea7c4c7ee440b33c6714 Mon Sep 17 00:00:00 2001 From: Braun Date: Thu, 31 Aug 2023 10:30:45 -0500 Subject: [PATCH 2/7] ci(docs): updating working directory for gha --- .github/workflows/deploy_docs.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/deploy_docs.yml b/.github/workflows/deploy_docs.yml index d8352072..19951b51 100644 --- a/.github/workflows/deploy_docs.yml +++ b/.github/workflows/deploy_docs.yml @@ -21,6 +21,10 @@ concurrency: group: "pages" cancel-in-progress: false +defaults: + run: + working-directory: ./data-products + jobs: # Single deploy job since we're just deploying deploy: From 95d21f2b026a986ec033d3cbd325a9aa5bbbe4db Mon Sep 17 00:00:00 2001 From: Braun Date: Thu, 31 Aug 2023 10:33:10 -0500 Subject: [PATCH 3/7] ci(docs): fixing poetry install flag --- .github/workflows/deploy_docs.yml | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/.github/workflows/deploy_docs.yml b/.github/workflows/deploy_docs.yml index 19951b51..7bb4d969 100644 --- a/.github/workflows/deploy_docs.yml +++ b/.github/workflows/deploy_docs.yml @@ -46,7 +46,7 @@ jobs: - name: Set .venv Config run : poetry config virtualenvs.in-project true - name: Install Deps - run: "../common-utils/common_utils_build.sh && poetry install --with-only main,docs --no-root --no-ansi" + run: "../common-utils/common_utils_build.sh && poetry install --only main,docs --no-root --no-ansi" - name: Build Docs run: "poetry run mkdocs build" - name: Setup Pages From 0b70cca26cb93e38a9956e009bd4db2f908bef1f Mon Sep 17 00:00:00 2001 From: Braun Date: Thu, 31 Aug 2023 13:18:57 -0500 Subject: [PATCH 4/7] ci(docs): updating docs site to deploy using mkdocs cli --- .github/workflows/deploy_docs.yml | 23 +++++------------------ 1 file changed, 5 insertions(+), 18 deletions(-) diff --git a/.github/workflows/deploy_docs.yml b/.github/workflows/deploy_docs.yml index 7bb4d969..1eec4ee6 100644 --- a/.github/workflows/deploy_docs.yml +++ b/.github/workflows/deploy_docs.yml @@ -1,5 +1,5 @@ # Simple workflow for deploying static content to GitHub Pages -name: Deploy static content to Pages +name: Deploy Docs Site to Pages on: # Runs on pushes targeting the default branch @@ -22,15 +22,12 @@ concurrency: cancel-in-progress: false defaults: - run: - working-directory: ./data-products + run: + working-directory: ./data-products jobs: # Single deploy job since we're just deploying deploy: - environment: - name: github-pages - url: ${{ steps.deployment.outputs.page_url }} runs-on: ubuntu-latest steps: - name: Checkout @@ -47,15 +44,5 @@ jobs: run : poetry config virtualenvs.in-project true - name: Install Deps run: "../common-utils/common_utils_build.sh && poetry install --only main,docs --no-root --no-ansi" - - name: Build Docs - run: "poetry run mkdocs build" - - name: Setup Pages - uses: actions/configure-pages@v3 - - name: Upload 
artifact - uses: actions/upload-pages-artifact@v2 - with: - # Upload entire repository - path: 'docs/site' - - name: Deploy to GitHub Pages - id: deployment - uses: actions/deploy-pages@v2 \ No newline at end of file + - name: Deploy Docs + run: "poetry run mkdocs gh-deploy" \ No newline at end of file From 8642f12c793551a1293636209c17dd9ade412b17 Mon Sep 17 00:00:00 2001 From: Braun Date: Thu, 31 Aug 2023 13:23:17 -0500 Subject: [PATCH 5/7] ci(docs): updating permissions allowing workflow to push to the github-pages branch --- .github/workflows/deploy_docs.yml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/deploy_docs.yml b/.github/workflows/deploy_docs.yml index 1eec4ee6..cabc8f51 100644 --- a/.github/workflows/deploy_docs.yml +++ b/.github/workflows/deploy_docs.yml @@ -11,9 +11,7 @@ on: # Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages permissions: - contents: read - pages: write - id-token: write + contents: write # Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued. # However, do NOT cancel in-progress runs as we want to allow these production deployments to complete. 
From a3212fa415957787bc0f025983fbd5ce99b701ff Mon Sep 17 00:00:00 2001 From: Braun Date: Thu, 31 Aug 2023 13:28:29 -0500 Subject: [PATCH 6/7] ci(docs): adding -force flag to mkdocs gh-deploy --- .github/workflows/deploy_docs.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/deploy_docs.yml b/.github/workflows/deploy_docs.yml index cabc8f51..7c69f924 100644 --- a/.github/workflows/deploy_docs.yml +++ b/.github/workflows/deploy_docs.yml @@ -43,4 +43,4 @@ jobs: - name: Install Deps run: "../common-utils/common_utils_build.sh && poetry install --only main,docs --no-root --no-ansi" - name: Deploy Docs - run: "poetry run mkdocs gh-deploy" \ No newline at end of file + run: "poetry run mkdocs gh-deploy --force" \ No newline at end of file From 2a8351211ae65e59abbd0c1ca3749dd449ccc673 Mon Sep 17 00:00:00 2001 From: Braun Date: Tue, 26 Sep 2023 09:55:25 -0500 Subject: [PATCH 7/7] docs(docs): adding shared utils to docs --- data-products/docs/flows/sql_etl/usage.md | 69 ++++++++++++++++++++--- data-products/docs/index.md | 3 +- data-products/docs/shared/utils/code.md | 1 + data-products/mkdocs.yml | 9 ++- 4 files changed, 72 insertions(+), 10 deletions(-) create mode 100644 data-products/docs/shared/utils/code.md diff --git a/data-products/docs/flows/sql_etl/usage.md b/data-products/docs/flows/sql_etl/usage.md index 84950915..11cb2dfb 100644 --- a/data-products/docs/flows/sql_etl/usage.md +++ b/data-products/docs/flows/sql_etl/usage.md @@ -16,13 +16,17 @@ Here is a flow diagram showing how it all works: To leverage this service for jobs, `sql_etl` you will need to know: +- How to create a deployment - How to create the SQL expected for things to work - How the offset and interval logic works - How to setup your local environment -- How to create a deployment Below we go through each element. 
+## Deployment + +The + ## SQL statments @@ -76,7 +80,7 @@ Notice that we can have up to three files possible in a folder: - `load.sql` - `offset.sql` -Also, notice that we support files liveing at the top level of a folder within the `sql` folder. We also support a single level of nesting files in subdirectories of a folder in `sql`. +Also, notice that we support files living at the top level of a folder within the `sql` folder. We also support a single level of nesting files in subdirectories of a folder in `sql`. The `curated_feed_exports_aurora` is an example of the supporting nesting of subdirectories. If, needed SQL files can live at the top of the parent directory and will executed after the subdirectories are completed. @@ -103,9 +107,11 @@ FROM TABLE_TO_BE_EXTRACTED; Now when the service renders and uses this extract statment, it knows where to run it. All configuration needed for connecting the engines we support is supplied via environment variables, which will be discussed in another section. -NOTE: All SQL running through this service must have a `sql_engine` block. +!!! note + All SQL running through this service must have a `sql_engine` block. -NOTE: For `snowflake` and `bigquery`, we will wrap the rendered `data.sql` in specific export logic located in this method: +!!! note + For `snowflake` and `bigquery`, we will wrap the rendered `data.sql` in specific export logic located in this method: [`get_extraction_sql`](/flows/sql_etl/code/#src.sql_etl.run_jobs_flow.SqlEtlJob.get_extraction_sql) @@ -113,6 +119,36 @@ NOTE: For `snowflake` and `bigquery`, we will wrap the rendered `data.sql` in sp A single `data.sql` file means you are just loading to cloud storage. 
+ Since these files are being formatted by jinja2, we can add our own custom keywords to do things like this: + +```sql + SELECT + * +FROM +{% if for_backfill %} + deduped_table +{% else %} + live_table +{% endif %} +WHERE submission_timestamp >= {{ helpers.parse_iso8601(batch_start) }} +AND submission_timestamp < {{ helpers.parse_iso8601(batch_end) }} +QUALIFY row_number() over (PARTITION BY DATE(submission_timestamp), +document_id +ORDER BY +submission_timestamp desc) = 1 +``` + +Also, since these are jinja2 templates we can import helpers using something like this toward the top of the file: + +`{% import 'helpers.j2' as helpers with context %}` + +This is what makes the `helpers.parse_iso8601` call possible in the last SQL example above. + +These parameters are available to all `data.sql` files: + +- The fields of the base `SqlJob` pydantic model described [here](/shared/utils/code/#src.shared.utils.SqlJob). Your custom parameter values are defined as a dictionary in `kwargs`. +- The fields of the `SqlEtlJob` pydantic model described [here](/flows/sql_etl/code/#src.sql_etl.run_jobs_flow.SqlEtlJob) + ### Loading Loading requires the existence of a `load.sql`. @@ -127,14 +163,15 @@ FROM @my_ext_stage FILE_FORMAT = (TYPE = 'PARQUET') ``` -NOTE: Currently all data exports from `snowflake` and `bigquery` will be in `PARQUET` format. +!!! note + Currently all data exports from `snowflake` and `bigquery` will be in `PARQUET` format. ### Offset This service provides the option to run either `incremental` or `non-incremental` extract-load jobs. -When you have the ability to pull the `last_offset` from the destination table on an incremental job, you will need to provide an `offset.sql` file. This activates the incremental logic. +When you have the need to produce a `last_offset` to make the flow run incremental, you will need to provide an `offset.sql` file. This activates the incremental logic. 
For example: ```sql @@ -143,7 +180,23 @@ select max(timestamp_field) as last_offset from TABLE_TO_BE_LOADED ``` -There is also the option to track offset using an external table called `sql_offset_state` with the `with_external_state` flag. If this flag is set, then `offset.sql` is not required to activate incremental logic. +Pretty much anything can be in the `offset.sql`. For example, if you want to pin the offset to the last 24 hours you could do something like this: + +```sql +{% set sql_engine = "snowflake" %} +select (trunc(sysdate()::timestamp, 'day') - interval '1 day') - interval '1 microsecond'; +``` + +If this is run on `2023-09-17 13:00`, this would result in a value of `2023-09-15 23:59:59.999999`. When the incremental logic built into the service adds 1 microsecond, the `data.sql` file will have access to a `batch_start` timestamp of `2023-09-16 00:00:00.000000` for a proper `>=` where clause declaration. + +There is also the option to track offset using an external table called `sql_offset_state` with the `with_external_state` flag. If this flag is set, then `offset.sql` is not required to activate incremental logic because that value is pulled from the state table. + +!!! note + Using the `with_external_state` flag does require the use of the `initial_last_offset` job parameter. + +`with_external_state`, incremental logic, and job parameters will be discussed later on. + +### Intervals -`with_external_state` and the incremental logic will be discussed later on. +The interval logic in this diff --git a/data-products/docs/index.md b/data-products/docs/index.md index bb2e269d..04e0c204 100644 --- a/data-products/docs/index.md +++ b/data-products/docs/index.md @@ -13,7 +13,8 @@ Shared utilities live in `src/shared` can will be available to all flows. This project uses [Poetry](https://python-poetry.org/) for dependency management. 
 We also use the (Poetry dotenv plugin)[https://github.com/volopivoshenko/poetry-plugin-dotenv] to apply environment variables at runtime through the use of a `.env` file that should live at the root of this project. -NOTE: If you get a dependency error on installion of the plug, you just need to update Poetry using `poetry self update`. +!!! note + If you get a dependency error on installation of the plugin, you just need to update Poetry using `poetry self update`. These should be deployed using the details offered by each project's respective documentation. diff --git a/data-products/docs/shared/utils/code.md b/data-products/docs/shared/utils/code.md new file mode 100644 index 00000000..4bdeec4d --- /dev/null +++ b/data-products/docs/shared/utils/code.md @@ -0,0 +1 @@ +:::src.shared.utils \ No newline at end of file diff --git a/data-products/mkdocs.yml b/data-products/mkdocs.yml index 78d08328..d009e5bc 100644 --- a/data-products/mkdocs.yml +++ b/data-products/mkdocs.yml @@ -4,6 +4,10 @@ theme: nav: - Getting Started: index.md + - Utilities: + - shared: + - utils: + - Code: shared/utils/code.md - Flows: - sql_etl: - Usage: flows/sql_etl/usage.md @@ -24,4 +28,7 @@ plugins: show_bases: True show_signature: False heading_level: 1 - filters: ["!^_[^_]", "!^__[^__]"] \ No newline at end of file + filters: ["!^_[^_]", "!^__[^__]"] + +markdown_extensions: + - admonition \ No newline at end of file