From 08d4d4d69dbad7c3ff49aaa06c5cce807ab432ba Mon Sep 17 00:00:00 2001 From: GSmithApps Date: Sat, 25 Jan 2025 19:20:04 -0600 Subject: [PATCH 01/27] added my patching example --- .gitignore | 1 + hello/hello_patch_grant_README.md | 99 +++++++++++++++++++++++++ hello/hello_patch_grant_run_worker.py | 57 ++++++++++++++ hello/hello_patch_grant_run_workflow.py | 29 ++++++++ 4 files changed, 186 insertions(+) create mode 100644 hello/hello_patch_grant_README.md create mode 100644 hello/hello_patch_grant_run_worker.py create mode 100644 hello/hello_patch_grant_run_workflow.py diff --git a/.gitignore b/.gitignore index 033df5fb..cf7ccef9 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ .venv __pycache__ +/.vscode/ diff --git a/hello/hello_patch_grant_README.md b/hello/hello_patch_grant_README.md new file mode 100644 index 00000000..7c8066fa --- /dev/null +++ b/hello/hello_patch_grant_README.md @@ -0,0 +1,99 @@ +# Hello Patch Tutorial + +This tutorial has two associated python files and +[a YouTube playlist](https://www.youtube.com/playlist?list=PLytZkHFJwKUdfxFQnuo0Fson0QM0VL9hL). + +The pastable snippets are here: + +```python + await asyncio.sleep(10) + print(f"v1val: {workflow.patched('v1')}") +``` + +## Video Explanations + +Each video had some explanation text at the top -- +that is pasted below: + +### Video 1: Not Replaying + +In this video, we'll begin to discuss the behavior +of the patched function. We'll start by +showing what it does when not replaying, and +in another video, we'll show what happens when replaying. 
+ +- if not replaying, and the execution hits a call to patched, + it first checks the event history, and: + - if the patch ID is not in the event history, + it will add a marker to the + event history and upsert a search attribute + - if the patch ID is in the event history, + it won't modify the history +- In either case, it will return true + +> there is a caveat to the above, and +> we will get to that in a later video + +### Video 2: Replaying with Patch ID in the history + +In this video, we'll continue our discussion of the behavior +of the patched function. We'll +show what it does when replaying. + +- if replaying, and the code has a call to patched, + - if the patch ID is somewhere in the event history + - if the event is after where we currently are + in the event history, then, in other words, + our patch is before the + event, then our patch is too early. it will + attempt to write the marker to the replay event + history, but it will throw a non-deterministic + exception because the replay and original event + histories don't match + - if the event is before where we currently are + in the execution, and there hasn't been a call to patched + with that id yet, then it won't even get to this call + because an NDE would have already been thrown + - if the event is where we currently are in the + event history, it will return true and add a + marker to the replay event history (which means it + will match the original event history) and it will + continue. 
+ *This is similar to the behavior of the non-replay case* + - if the event is before where we currently are + in the execution, meaning the replay already has seen + a call to patched with this ID, + it will return true and not + do anything to the replay event history + *This is similar to the behavior of the non-replay case* + - if the patch ID is not anywhere in the event history + - we will discuss in the next video + +### Video 3: Replaying with Patch ID not in the history + +In this video, we'll finish our discussion of the behavior +of the patched function. We'll +show what it does when replaying, and the patch ID +is nowhere in the event history. + +- if replaying, and the code has a call to patched, + - if the patch ID is somewhere in the event history + - ... + - if the patch ID is not anywhere in the event history + - it will return false and not add anything to + the event history. Furthermore, and this is the + caveat, it will make all future calls to patched + with that ID false -- even after it is done replaying + and is running normal code. + +Why is this a caveat? + +- in the first video, we said that if not replaying, + the patched function will always return true, and if + the marker doesn't exist, it will add it, and if + the marker already exists, it won't re-add it. + + But what this + is saying is that this doesn't hold if there was already + a call to patched with that ID in the replay code, but not + in the event history. diff --git a/hello/hello_patch_grant_run_worker.py b/hello/hello_patch_grant_run_worker.py new file mode 100644 index 00000000..e860a39c --- /dev/null +++ b/hello/hello_patch_grant_run_worker.py @@ -0,0 +1,57 @@ +""" +In this video, we'll finish our discussion of the behavior +of the patched function. We'll +show what it does when replaying, and the patch ID +is nowhere in the event history. + +- if replaying, and the code has a call to patched, + - if the patch ID is somewhere in the event history + - ... 
+ - if the patch ID is not anywhere in the event history + - it will return false and not add anything to + the event history. Furthermore, and this is the + caveat, it will make all future calls to patched + with that ID false -- even after it is done replaying + and is running normal code. + + +Why is this a caveat? +- in the first video, we said that if not replaying, + the patched function will always return true, and if + the marker doesn't exist, it will add it, and if + the marker already exists, it won't re-add it. + + But what this + is saying is that this doesn't hold if there was already + a call to patched with that ID in the replay code, but not + in the event history. +""" + +import asyncio + +from temporalio import activity, workflow +from temporalio.client import Client +from temporalio.worker import Worker + +@workflow.defn() +class MyWorkflow: + @workflow.run + async def run(self) -> str: + + pass + +async def main(): + + client = await Client.connect("localhost:7233") + + worker = Worker( + client, + task_queue="hello-patch-task-queue", + workflows=[MyWorkflow], + ) + + await worker.run() + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/hello/hello_patch_grant_run_workflow.py b/hello/hello_patch_grant_run_workflow.py new file mode 100644 index 00000000..9f995aca --- /dev/null +++ b/hello/hello_patch_grant_run_workflow.py @@ -0,0 +1,29 @@ +import asyncio +import sys +from dataclasses import dataclass +from datetime import timedelta + +from temporalio import activity, exceptions, workflow +from temporalio.client import Client +from temporalio.worker import Worker + +from hello.hello_patch_grant_run_worker import MyWorkflow + + +async def main(): + + # Uncomment the lines below to see logging output + # import logging + # logging.basicConfig(level=logging.INFO) + + client = await Client.connect("localhost:7233") + + await client.execute_workflow( + MyWorkflow.run, # type: ignore + id="hello-patch-workflow-id", + 
task_queue="hello-patch-task-queue", + ) + + +if __name__ == "__main__": + asyncio.run(main()) From bdd154be89de3aa72ef0aed512ba759fa2ae5db7 Mon Sep 17 00:00:00 2001 From: GSmithApps Date: Sat, 25 Jan 2025 23:41:38 -0600 Subject: [PATCH 02/27] small fix --- hello/hello_patch_grant_run_worker.py | 31 +-------------------------- 1 file changed, 1 insertion(+), 30 deletions(-) diff --git a/hello/hello_patch_grant_run_worker.py b/hello/hello_patch_grant_run_worker.py index e860a39c..0adc5cec 100644 --- a/hello/hello_patch_grant_run_worker.py +++ b/hello/hello_patch_grant_run_worker.py @@ -1,35 +1,6 @@ -""" -In this video, we'll finish our discussion of the behavior -of the patched function. We'll -show what it does when replaying, and the patch ID -is nowhere in the event history. - -- if replaying, and the code has a call to patched, - - if the patch ID is somewhere in the event history - - ... - - if the patch ID is not anywhere in the event history - - it will return false and not add anything to - the event history. Furthermore, and this is the - caveat, it will make all future calls to patched - with that ID false -- even after it is done replaying - and is running normal code. - - -Why is this a caveat? -- in the first video, we said that if not replaying, - the patched function will always return true, and if - the marker doesn't exist, it will add it, and if - the marker already exists, it won't re-add it. - - But what this - is saying is that this doesn't hold if there was already - a call to patched with that ID in the replay code, but not - in the event history. 
-""" - import asyncio -from temporalio import activity, workflow +from temporalio import workflow from temporalio.client import Client from temporalio.worker import Worker From 612c374e2355f7a83db08beefbfcd17bae0fc9a7 Mon Sep 17 00:00:00 2001 From: GSmithApps Date: Sun, 26 Jan 2025 18:23:46 -0600 Subject: [PATCH 03/27] good --- hello/hello_patch_grant_README.md | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/hello/hello_patch_grant_README.md b/hello/hello_patch_grant_README.md index 7c8066fa..c88bf877 100644 --- a/hello/hello_patch_grant_README.md +++ b/hello/hello_patch_grant_README.md @@ -1,5 +1,7 @@ # Hello Patch Tutorial +> ***This applies to Python and Dotnet SDKs*** + This tutorial has two associated python files and [a YouTube playlist](https://www.youtube.com/playlist?list=PLytZkHFJwKUdfxFQnuo0Fson0QM0VL9hL). @@ -41,7 +43,7 @@ of the patched function. We'll show what it does when replaying. - if replaying, and the code has a call to patched, - - if the patch ID is somewhere in the event history + - if the patch ID exists anywhere in the event history - if the event is after where we currently are in the event history, then, in other words, our patch is before the @@ -97,3 +99,27 @@ Why is this a caveat? is saying is that this doesn't hold if there was already a call to patched with that ID in the replay code, but not in the event history. + +## A Summary of the Two Potentially Unexpected Behaviors + +(neither of these apply to typescript, but they both apply +to python and dotnet) + +1. When Replaying, in the scenario of ***it hits a call to + patched, but that patch ID isn't before/on that point in + the event history***, you may not expect that + the event history *after* where you currently + are matters. Because: + 1. If that patch ID exists later, you get an NDE (Video 2, situation 1) + (this doesn't happen in typescript... in typescript, it behaves + like the bullet below) + 2. 
If it doesn't exist later, you don't get an NDE, and + it returns false. (Video 3, main situation) (this is + the same between python, ts, and dotnet) +2. When Replaying, if you hit a call to patched with an ID that + doesn't exist in the history, then not only will it return + false in that occurrence, but it will also return false if + the execution surpasses the Replay threshold and is running new code. + (Video 3, main situation) (this doesn't happen in TS -- it will + return false in that occurrence, but this doesn't modify the behavior + of future calls). From 8340fbcfb2c3eacd6fdd3f77d8f3e1912bf8b411 Mon Sep 17 00:00:00 2001 From: GSmithApps Date: Sun, 26 Jan 2025 19:05:18 -0600 Subject: [PATCH 04/27] added summary of all three SDKs --- ...hello_patch_grant_README_all_three_SDKs.md | 123 ++++++++++++++++++ 1 file changed, 123 insertions(+) create mode 100644 hello/hello_patch_grant_README_all_three_SDKs.md diff --git a/hello/hello_patch_grant_README_all_three_SDKs.md b/hello/hello_patch_grant_README_all_three_SDKs.md new file mode 100644 index 00000000..47f9e8bf --- /dev/null +++ b/hello/hello_patch_grant_README_all_three_SDKs.md @@ -0,0 +1,123 @@ +# `Patched()` function Deep Dive + +This tutorial has two associated python files and +[a YouTube playlist](https://www.youtube.com/playlist?list=PLytZkHFJwKUdfxFQnuo0Fson0QM0VL9hL). 
+ +The pastable snippets are here: + +## Not Replaying + +- if not replaying, and the execution hits a call to patched, + it first checks the event history, and: + - if the patch ID is not in the event history, + it will add a marker to the + event history and upsert a search attribute + (you can think of this like the first block of patching with + a given patch ID) + - if the patch ID is in the event history, + it won't modify the history + (you can think of this like the second + block of patching of a given patch ID) +- In either case, it will return true + +> there is a caveat to the above in Python and Dotnet, and +> we will discuss that below + +## Replaying With Marker Before-Or-At Current Location + +- if replaying, + - if the code has a call to patched, and if the event history + has a marker from a call to patched in the same place (which means it + will match the original event history), then + write a marker to the replay event history and return true. + *This is similar to the behavior of the non-replay case, and + just like in that one, you can think of this like the first block of patching with + a given patch ID* + - if the code has a call to patched, and the event history + has a marker with that Patch ID earlier in the history, + then it will simply return true and not modify the + replay event history. + *This is similar to the behavior of the non-replay case, and just + like in that case, you can think of this like the second + block of patching of a given patch ID* + +## Replaying With Marker After Current Location or No Marker at All + +We have covered what happens when replaying and the code +hits a call to patched and there's a marker in the event +history on or before that spot in the execution. What remains +is what happens if (1) the marker is after that spot in the +execution or (2) if there is no marker at all for that patch. 
+ +In these situations, the TypeScript SDK behaves in one way, +and the Python and Dotnet SDKs behave in another. + +### (1) There is a marker after that spot in the execution + +#### (1) TypeScript With Later Marker + +Returns False. + +#### (1) Python and Dotnet With Later Marker + +if the event is after where we currently are +in the event history, then, in other words, +our patch is before the +event, then our patch is too early. it will +attempt to write the marker to the replay event +history, but it will throw a non-deterministic +exception because the replay and original event +histories don't match + +### (2) There is no marker for that Patch ID + +#### (2) TypeScript With No Marker + +Returns False. + +#### (2) Python and Dotnet With No Marker + +it will return false and not add anything to +the event history. Furthermore, ***and this is the +caveat mentioned in the very first section***, it will make all future calls to patched +with that ID false -- even after it is done replaying +and is running new code. + +Why is this a caveat? + +in the first video, we said that ***if not replaying, +the patched function will always return true***, and if +the marker doesn't exist, it will add it, and if +the marker already exists, it won't re-add it. + +But what this +is saying is that this doesn't hold if there was already +a call to patched with that ID in the replay code, but not +in the event history. In this situation, it won't return +true. + +### A Summary of the Two Potentially Unexpected Behaviors + +(neither of these apply to typescript, but they both apply +to python and dotnet) + +1. When Replaying, in the scenario of ***it hits a call to + patched, but that patch ID isn't before/on that point in + the event history***, you may not expect that + the event history *after* where you currently + are matters. Because: + 1. If that patch ID exists later, you get an NDE [(see Python and Dotnet Later Marker above)](#1-python-and-dotnet-with-later-marker). + 2. 
If it doesn't exist later, you don't get an NDE, and + it returns false + [(see Python and Dotnet No Marker above)](#2-python-and-dotnet-with-no-marker). + + (In TypeScript, both of these just return false with no NDE) +2. When Replaying, if you hit a call to patched with an ID that + doesn't exist in the history, then not only will it return + false in that occurrence, but it will also return false if + the execution surpasses the Replay threshold and is running new code. + [(see Python and Dotnet No Marker above)](#2-python-and-dotnet-with-no-marker). + + (This doesn't happen in TS -- it will + return false in that occurrence, but this doesn't modify the behavior + of future calls). From 4a9ba341633a5a9bb7b1a95d1f04eb6aacf77ad9 Mon Sep 17 00:00:00 2001 From: GSmithApps Date: Sun, 26 Jan 2025 19:31:41 -0600 Subject: [PATCH 05/27] finish implications --- ...hello_patch_grant_README_all_three_SDKs.md | 29 +++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/hello/hello_patch_grant_README_all_three_SDKs.md b/hello/hello_patch_grant_README_all_three_SDKs.md index 47f9e8bf..ee72f31e 100644 --- a/hello/hello_patch_grant_README_all_three_SDKs.md +++ b/hello/hello_patch_grant_README_all_three_SDKs.md @@ -121,3 +121,32 @@ to python and dotnet) (This doesn't happen in TS -- it will return false in that occurrence, but this doesn't modify the behavior of future calls). + +### Summary of TypeScript's Behavior + +- In TypeScript, the behavior of hitting a patched statement is really simple and can be summarized + succinctly: + - if there's a marker + as the next event in the history, add a marker to the replay + event history and return true + - if there's a marker before in the event history, return true + but don't add a marker to the event history + - if there's no marker on or before, return false and don't add + anything to the event history. 
+ - lastly, as I hope you'd expect, if there's a marker in the event history, + but no call to patched right there, throw an NDE. + +### Implications of the Behaviors + +- In TypeScript, if you deploy new code, it will run it if it is + not replaying, and if it is replaying, it will just do what + it did the last time. + - this means that if it has gotten through some of your code, then + the worker crashes and you deploy new code, then when it replays, + it will use the old code throughout the replay, but switch over + to new code after it has passed the replay threshold. This means + your new code and your old code must work together. +- In Python and Dotnet, if you deploy new code while a worker is down, + any workflows that were in the middle of executing will replay + using old code and continue using old code throughout the rest of + their execution (i.e. they won't switch to the new code). From d334f23fd44db3e2fc01c9e68a98109eb51b4810 Mon Sep 17 00:00:00 2001 From: GSmithApps Date: Sun, 26 Jan 2025 19:38:54 -0600 Subject: [PATCH 06/27] fix --- hello/hello_patch_grant_README_all_three_SDKs.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/hello/hello_patch_grant_README_all_three_SDKs.md b/hello/hello_patch_grant_README_all_three_SDKs.md index ee72f31e..ff15b1c5 100644 --- a/hello/hello_patch_grant_README_all_three_SDKs.md +++ b/hello/hello_patch_grant_README_all_three_SDKs.md @@ -150,3 +150,7 @@ to python and dotnet) any workflows that were in the middle of executing will replay using old code and continue using old code throughout the rest of their execution (i.e. they won't switch to the new code). + - Note that this means that the Workflow ***does not always run + the newest code***. 
It only does that if not replaying or if + passed replay and there hasn't been a call to patched (with that ID) throughout + the replay From b01f910a542babafc17ddf738ea6fd52a2649ce9 Mon Sep 17 00:00:00 2001 From: GSmithApps Date: Sun, 26 Jan 2025 19:52:07 -0600 Subject: [PATCH 07/27] good --- .../hello_patch_grant_README_all_three_SDKs.md | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/hello/hello_patch_grant_README_all_three_SDKs.md b/hello/hello_patch_grant_README_all_three_SDKs.md index ff15b1c5..33b8c6ee 100644 --- a/hello/hello_patch_grant_README_all_three_SDKs.md +++ b/hello/hello_patch_grant_README_all_three_SDKs.md @@ -148,9 +148,20 @@ to python and dotnet) your new code and your old code must work together. - In Python and Dotnet, if you deploy new code while a worker is down, any workflows that were in the middle of executing will replay - using old code and continue using old code throughout the rest of - their execution (i.e. they won't switch to the new code). + using old code and then for the rest of the execution, they + will either: + - use new code if there was no call to patched in the replay code + - if there was a call to patched in the replay code, they will + run the rest of the code with the new code + + This might sound odd, but it's actually exactly what's needed because + that means that if the future patched code depends on earlier patched code, + then it won't use the new code -- it will use the old code. But if + there's new code in the future, and there was no code earlier in the + body that required the new patch, then it can switch over to the new code, + and it will do that. + - Note that this means that the Workflow ***does not always run the newest code***. It only does that if not replaying or if passed replay and there hasn't been a call to patched (with that ID) throughout - the replay + the replay. 
From 1759806154d06ea307b66394f0b40711e6add693 Mon Sep 17 00:00:00 2001 From: GSmithApps Date: Sun, 26 Jan 2025 19:54:54 -0600 Subject: [PATCH 08/27] cleaning the explanation --- ...hello_patch_grant_README_all_three_SDKs.md | 61 +++++++++++-------- 1 file changed, 34 insertions(+), 27 deletions(-) diff --git a/hello/hello_patch_grant_README_all_three_SDKs.md b/hello/hello_patch_grant_README_all_three_SDKs.md index 33b8c6ee..ed4f86ed 100644 --- a/hello/hello_patch_grant_README_all_three_SDKs.md +++ b/hello/hello_patch_grant_README_all_three_SDKs.md @@ -138,30 +138,37 @@ to python and dotnet) ### Implications of the Behaviors -- In TypeScript, if you deploy new code, it will run it if it is - not replaying, and if it is replaying, it will just do what - it did the last time. - - this means that if it has gotten through some of your code, then - the worker crashes and you deploy new code, then when it replays, - it will use the old code throughout the replay, but switch over - to new code after it has passed the replay threshold. This means - your new code and your old code must work together. -- In Python and Dotnet, if you deploy new code while a worker is down, - any workflows that were in the middle of executing will replay - using old code and then for the rest of the execution, they - will either: - - use new code if there was no call to patched in the replay code - - if there was a call to patched in the replay code, they will - run the rest of the code with the new code - - This might sound odd, but it's actually exactly what's needed because - that means that if the future patched code depends on earlier patched code, - then it won't use the new code -- it will use the old code. But if - there's new code in the future, and there was no code earlier in the - body that required the new patch, then it can switch over to the new code, - and it will do that. - - - Note that this means that the Workflow ***does not always run - the newest code***. 
It only does that if not replaying or if - passed replay and there hasn't been a call to patched (with that ID) throughout - the replay. +#### Implications for TypeScript + +If you deploy new code, it will run it if it is +not replaying, and if it is replaying, it will just do what +it did the last time. + +- this means that if it has gotten through some of your code, then + the worker crashes and you deploy new code, then when it replays, + it will use the old code throughout the replay, but switch over + to new code after it has passed the replay threshold. This means + your new code and your old code must work together. + +#### Implications for Python and Dotnet + +In Python and Dotnet, if you deploy new code while a worker is down, +any workflows that were in the middle of executing will replay +using old code and then for the rest of the execution, they +will either: + +- use new code if there was no call to patched in the replay code +- if there was a call to patched in the replay code, they will + run the rest of the code with the new code + +This might sound odd, but it's actually exactly what's needed because +that means that if the future patched code depends on earlier patched code, +then it won't use the new code -- it will use the old code. But if +there's new code in the future, and there was no code earlier in the +body that required the new patch, then it can switch over to the new code, +and it will do that. + +Note that this behavior means that the Workflow ***does not always run +the newest code***. It only does that if not replaying or if +surpassed replay and there hasn't been a call to patched (with that ID) throughout +the replay. 
From 29dab6e7b91c7616a97c961b201cf90c778996da Mon Sep 17 00:00:00 2001 From: GSmithApps Date: Sun, 26 Jan 2025 20:06:32 -0600 Subject: [PATCH 09/27] good --- hello/hello_patch_grant_README_all_three_SDKs.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/hello/hello_patch_grant_README_all_three_SDKs.md b/hello/hello_patch_grant_README_all_three_SDKs.md index ed4f86ed..4fa0d478 100644 --- a/hello/hello_patch_grant_README_all_three_SDKs.md +++ b/hello/hello_patch_grant_README_all_three_SDKs.md @@ -144,11 +144,11 @@ If you deploy new code, it will run it if it is not replaying, and if it is replaying, it will just do what it did the last time. -- this means that if it has gotten through some of your code, then - the worker crashes and you deploy new code, then when it replays, - it will use the old code throughout the replay, but switch over - to new code after it has passed the replay threshold. This means - your new code and your old code must work together. +this means that if it has gotten through some of your code, then +the worker crashes and you deploy new code, then when it replays, +it will use the old code throughout the replay, but switch over +to new code after it has passed the replay threshold. This means +your new code and your old code must work together. #### Implications for Python and Dotnet @@ -157,9 +157,9 @@ any workflows that were in the middle of executing will replay using old code and then for the rest of the execution, they will either: -- use new code if there was no call to patched in the replay code -- if there was a call to patched in the replay code, they will - run the rest of the code with the new code +1. use new code if there was no call to patched in the replay code +2. 
if there was a call to patched in the replay code, they will + run the rest of the code with the old code This might sound odd, but it's actually exactly what's needed because that means that if the future patched code depends on earlier patched code, From 689207653088163144087847d28e1accc1eb5d21 Mon Sep 17 00:00:00 2001 From: GSmithApps Date: Mon, 27 Jan 2025 09:05:04 -0600 Subject: [PATCH 10/27] added TS example --- ...hello_patch_grant_README_all_three_SDKs.md | 30 ++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/hello/hello_patch_grant_README_all_three_SDKs.md b/hello/hello_patch_grant_README_all_three_SDKs.md index 4fa0d478..fd92a2f7 100644 --- a/hello/hello_patch_grant_README_all_three_SDKs.md +++ b/hello/hello_patch_grant_README_all_three_SDKs.md @@ -148,7 +148,35 @@ this means that if it has gotten through some of your code, then the worker crashes and you deploy new code, then when it replays, it will use the old code throughout the replay, but switch over to new code after it has passed the replay threshold. This means -your new code and your old code must work together. +your new code and your old code must work together. For example, +if your Workflow Definition originally looked like this: + +```ts +console.log('original code before the sleep') +await sleep(10000); // <-- Kill the Worker while this is waiting, and deploy the new code below +console.log('original code after the sleep') +``` + +Now we kill the Worker during the sleep, and wrap our original +code in the else part of a patched `if` statement, and start +our Worker again. 
+ +```ts +if (patched('my-change-id')) { + console.log('new code before the sleep') +} else { + console.log('original code before the sleep') // this will run +} +await sleep(10000); +if (patched('my-change-id')) { + console.log('new code after the sleep') // this will run +} else { + console.log('original code after the sleep') +} +``` + +In the first part, it will be Replaying, and it will run the old code, +and after the sleep, it won't be Replaying, and it will run the new code. #### Implications for Python and Dotnet From 3a1ab1ceafd9c3560c58b756af1d75300ed4f383 Mon Sep 17 00:00:00 2001 From: GSmithApps Date: Wed, 5 Feb 2025 12:22:52 -0600 Subject: [PATCH 11/27] remove writeup --- ...hello_patch_grant_README_all_three_SDKs.md | 202 ------------------ prometheus/README.md | 4 +- 2 files changed, 3 insertions(+), 203 deletions(-) delete mode 100644 hello/hello_patch_grant_README_all_three_SDKs.md diff --git a/hello/hello_patch_grant_README_all_three_SDKs.md b/hello/hello_patch_grant_README_all_three_SDKs.md deleted file mode 100644 index fd92a2f7..00000000 --- a/hello/hello_patch_grant_README_all_three_SDKs.md +++ /dev/null @@ -1,202 +0,0 @@ -# `Patched()` function Deep Dive - -This tutorial has two associated python files and -[a YouTube playlist](https://www.youtube.com/playlist?list=PLytZkHFJwKUdfxFQnuo0Fson0QM0VL9hL). 
- -The pastable snippets are here: - -## Not Replaying - -- if not replaying, and the execution hits a call to patched, - it first checks the event history, and: - - if the patch ID is not in the event history, - it will add a marker to the - event history and upsert a search attribute - (you can think of this like the first block of patching with - a given patch ID) - - if the patch ID is in the event history, - it won't modify the history - (you can think of this like the second - block of patching of a given patch ID) -- In either case, it will return true - -> there is a caveat to the above in Python and Dotnet, and -> we will discuss that below - -## Replaying With Marker Before-Or-At Current Location - -- if replaying, - - if the code has a call to patched, and if the event history - has a marker from a call to patched in the same place (which means it - will match the original event history), then - write a marker to the replay event history and return true. - *This is similar to the behavior of the non-replay case, and - just like in that one, you can think of this like the first block of patching with - a given patch ID* - - if the code has a call to patched, and the event history - has a marker with that Patch ID earlier in the history, - then it will simply return true and not modify the - replay event history. - *This is similar to the behavior of the non-replay case, and just - like in that case, you can think of this like the second - block of patching of a given patch ID* - -## Replaying With Marker After Current Location or No Marker at All - -We have covered what happens when replaying and the code -hits a call to patched and there's a marker in the event -history on or before that spot in the execution. What remains -is what happens if (1) the marker is after that spot in the -execution or (2) if there is no marker at all for that patch. 
- -In these situations, the TypeScript SDK behaves in one way, -and the Python and Dotnet SDKs behave in another. - -### (1) There is a marker after that spot in the execution - -#### (1) TypeScript With Later Marker - -Returns False. - -#### (1) Python and Dotnet With Later Marker - -if the event is after where we currently are -in the event history, then, in other words, -our patch is before the -event, then our patch is too early. it will -attempt to write the marker to the replay event -history, but it will throw a non-deterministic -exception because the replay and original event -histories don't match - -### (2) There is no marker for that Patch ID - -#### (2) TypeScript With No Marker - -Returns False. - -#### (2) Python and Dotnet With No Marker - -it will return false and not add anything to -the event history. Furthermore, ***and this is the -caveat mentioned in the very first section***, it will make all future calls to patched -with that ID false -- even after it is done replaying -and is running new code. - -Why is this a caveat? - -in the first video, we said that ***if not replaying, -the patched function will always return true***, and if -the marker doesn't exist, it will add it, and if -the marker already exists, it won't re-add it. - -But what this -is saying is that this doesn't hold if there was already -a call to patched with that ID in the replay code, but not -in the event history. In this situation, it won't return -true. - -### A Summary of the Two Potentially Unexpected Behaviors - -(neither of these apply to typescript, but they both apply -to python and dotnet) - -1. When Replaying, in the scenario of ***it hits a call to - patched, but that patch ID isn't before/on that point in - the event history***, you may not expect that - the event history *after* where you currently - are matters. Because: - 1. If that patch ID exists later, you get an NDE [(see Python and Dotnet Later Marker above)](#1-python-and-dotnet-with-later-marker). - 2. 
If it doesn't exist later, you don't get an NDE, and - it returns false - [(see Python and Dotnet No Marker above)](#2-python-and-dotnet-with-no-marker). - - (In TypeScript, both of these just return false with no NDE) -2. When Replaying, if you hit a call to patched with an ID that - doesn't exist in the history, then not only will it return - false in that occurence, but it will also return false if - the execution surpasses the Replay threshold and is running new code. - [(see Python and Dotnet No Marker above)](#2-python-and-dotnet-with-no-marker). - - (This doesn't happen in TS -- it will - return false in that occurence, but this doesn't modify the behavior - of future calls). - -### Summary of TypeScript's Behavior - -- In TypeScript, the behavior of hitting a patched statement is really simple and can be summarized - succintly: - - if there's a marker - as the next event in the history, add a marker to the replay - event history and return true - - if there's a marker before in the event history, return true - but don't add a marker to the event history - - there's no marker on or befor, return false and don't add - anything to the event history. - - lastly, as I hope you'd expect, if there's a marker in the event history, - but no call to patched right there, throw an NDE. - -### Implications of the Behaviors - -#### Implications for TypeScript - -If you deploy new code, it will run it if it is -not replaying, and if it is replaying, it will just do what -it did the last time. - -this means that if it has gotten through some of your code, then -the worker crashes and you deploy new code, then when it replays, -it will use the old code throughout the replay, but switch over -to new code after it has passed the replay threshold. This means -your new code and your old code must work together. 
For example, -if your Workflow Definition originally looked like this: - -```ts -console.log('original code before the sleep') -await sleep(10000); // <-- Kill the Worker while this is waiting, and deploy the new code below -console.log('original code after the sleep') -``` - -Now we kill the Worker during the sleep, and wrap our original -code in the else part of a patched `if` statement, and start -our Worker again. - -```ts -if (patched('my-change-id')) { - console.log('new code before the sleep') -} else { - console.log('original code before the sleep') // this will run -} -await sleep(10000); -if (patched('my-change-id')) { - console.log('new code after the sleep') // this will run -} else { - console.log('original code after the sleep') -} -``` - -In the first part, it will be Replaying, and it will run the old code, -and after the sleep, it won't be Replaying, and it will run the new code. - -#### Implications for Python and Dotnet - -In Python and Dotnet, if you deploy new code while a worker is down, -any workflows that were in the middle of executing will replay -using old code and then for the rest of the execution, they -will either: - -1. use new code if there was no call to patched in the replay code -2. if there was a call to patched in the replay code, they will - run the rest of the code with the new code - -This might sound odd, but it's actually exactly what's needed because -that means that if the future patched code depends on earlier patched code, -then it won't use the new code -- it will use the old code. But if -there's new code in the future, and there was no code earlier in the -body that required the new patch, then it can switch over to the new code, -and it will do that. - -Note that this behavior means that the Workflow ***does not always run -the newest code***. It only does that if not replaying or if -surpassed replay and there hasn't been a call to patched (with that ID) throughout -the replay. 
diff --git a/prometheus/README.md b/prometheus/README.md index 6f605d62..c1ef6b99 100644 --- a/prometheus/README.md +++ b/prometheus/README.md @@ -13,5 +13,7 @@ Then, in another terminal, run the following to execute a workflow: poetry run python starter.py -After executing the workflow, the process will stay open so the metrics if this separate process can be accessed at +After executing the workflow, the process will stay open so the metrics will be visible. + +This separate process can be accessed at http://127.0.0.1:9001/metrics. \ No newline at end of file From 56b3047daf4b466ea63385c2020794626066d830 Mon Sep 17 00:00:00 2001 From: GSmithApps Date: Wed, 5 Feb 2025 12:25:29 -0600 Subject: [PATCH 12/27] delete my scratch --- hello/hello_patch_grant_README.md | 125 ------------------------ hello/hello_patch_grant_run_worker.py | 28 ------ hello/hello_patch_grant_run_workflow.py | 29 ------ 3 files changed, 182 deletions(-) delete mode 100644 hello/hello_patch_grant_README.md delete mode 100644 hello/hello_patch_grant_run_worker.py delete mode 100644 hello/hello_patch_grant_run_workflow.py diff --git a/hello/hello_patch_grant_README.md b/hello/hello_patch_grant_README.md deleted file mode 100644 index c88bf877..00000000 --- a/hello/hello_patch_grant_README.md +++ /dev/null @@ -1,125 +0,0 @@ -# Hello Patch Tutorial - -> ***This applies to Python and Dotnet SDKs*** - -This tutorial has two associated python files and -[a YouTube playlist](https://www.youtube.com/playlist?list=PLytZkHFJwKUdfxFQnuo0Fson0QM0VL9hL). - -The pastable snippets are here: - -```python - await asyncio.sleep(10) - print(f"v1val: {workflow.patched("v1")}") -``` - -## Video Explanations - -Each video had some explanation text at the top -- -that is pasted below: - -### Video 1: Not Replaying - -In this video, we'll begin to discuss the behavior -of the patched function. We'll start by -showing what it does when not replaying, and -in another video, we'll show what happens when replaying. 
- -- if not replaying, and the execution hits a call to patched, - it first checks the event history, and: - - if the patch ID is not in the event history, - it will add a marker to the - event history and upsert a search attribute - - if the patch ID is in the event history, - it won't modify the history -- In either case, it will return true - -> there is a caveat to the above, and -> we will get to that in a later video - -### Video 2: Replaying with Patch ID in the history - -In this video, we'll continue our discussion of the behavior -of the patched function. We'll -show what it does when replaying. - -- if replaying, and the code has a call to patched, - - if the patch ID exists anywhere in the event history - - if the event is after where we currently are - in the event history, then, in other words, - our patch is before the - event, then our patch is too early. it will - attempt to write the marker to the replay event - history, but it will throw a non-deterministic - exception because the replay and original event - histories don't match - - if the event is before where we currently are - in the execution, and there hasn't been a call to patched - with that id yet, then it won't even get to this call - because an NDE would have already been thrown - - if the event is where we currently are in the - event history, it will return true and add a - marker to the replay event history (which means it - will match the original event history) and it will - continue. 
- *This is similar to the behavior of the non-replay case* - - if the event is before where we currently are - in the execution, meaning the replay already has seen - a call to patched with this ID, - it will return true and not - do anything to the replay event history - *This is similar to the behavior of the non-replay case* - - if the patch ID is not anywhere in the event history - - we will discuss in the next video - -### Video 3: Replaying with Patch ID not in the history - -In this video, we'll finish our discussion of the behavior -of the patched function. We'll -show what it does when replaying, and the patch ID -is nowhere in the event history. - -- if replaying, and the code has a call to patched, - - if the patch ID is somewhere in the event history - - ... - - if the patch ID is not anywhere in the event history - - it will return false and not add anything to - the event history. Furthermore, and this is the - caveat, it will make all future calls to patched - with that ID false -- even after it is done replaying - and is running normal code. - -Why is this a caveat? - -- in the first video, we said that if not replaying, - the patched function will always return true, and if - the marker doesn't exist, it will add it, and if - the marker already exists, it won't re-add it. - - But what this - is saying is that this doesn't hold if there was already - a call to patched with that ID in the replay code, but not - in the event history. - -## A Summary of the Two Potentially Unexpected Behaviors - -(neither of these apply to typescript, but they both apply -to python and dotnet) - -1. When Replaying, in the scenario of ***it hits a call to - patched, but that patch ID isn't before/on that point in - the event history***, you may not expect that - the event history *after* where you currently - are matters. Because: - 1. If that patch ID exists later, you get an NDE (Video 2, situation 1) - (this doesn't happen in typescript... 
in typescript, it behaves - like the bullet below) - 2. If it doesn't exist later, you don't get an NDE, and - it returns false. (Video 3, main situation) (this is - the same between python, ts, and dotnet) -2. When Replaying, if you hit a call to patched with an ID that - doesn't exist in the history, then not only will it return - false in that occurence, but it will also return false if - the execution surpasses the Replay threshold and is running new code. - (Video 3, main situation) (this doesn't happen in TS -- it will - return false in that occurence, but this doesn't modify the behavior - of future calls). diff --git a/hello/hello_patch_grant_run_worker.py b/hello/hello_patch_grant_run_worker.py deleted file mode 100644 index 0adc5cec..00000000 --- a/hello/hello_patch_grant_run_worker.py +++ /dev/null @@ -1,28 +0,0 @@ -import asyncio - -from temporalio import workflow -from temporalio.client import Client -from temporalio.worker import Worker - -@workflow.defn() -class MyWorkflow: - @workflow.run - async def run(self) -> str: - - pass - -async def main(): - - client = await Client.connect("localhost:7233") - - worker = Worker( - client, - task_queue="hello-patch-task-queue", - workflows=[MyWorkflow], - ) - - await worker.run() - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/hello/hello_patch_grant_run_workflow.py b/hello/hello_patch_grant_run_workflow.py deleted file mode 100644 index 9f995aca..00000000 --- a/hello/hello_patch_grant_run_workflow.py +++ /dev/null @@ -1,29 +0,0 @@ -import asyncio -import sys -from dataclasses import dataclass -from datetime import timedelta - -from temporalio import activity, exceptions, workflow -from temporalio.client import Client -from temporalio.worker import Worker - -from hello.hello_patch_grant_run_worker import MyWorkflow - - -async def main(): - - # Uncomment the lines below to see logging output - # import logging - # logging.basicConfig(level=logging.INFO) - - client = await 
Client.connect("localhost:7233") - - await client.execute_workflow( - MyWorkflow.run, # type: ignore - id="hello-patch-workflow-id", - task_queue="hello-patch-task-queue", - ) - - -if __name__ == "__main__": - asyncio.run(main()) From 853bc01ee086abb64f4b1f08012b701c1d52896b Mon Sep 17 00:00:00 2001 From: GSmithApps Date: Sat, 15 Feb 2025 11:08:31 -0600 Subject: [PATCH 13/27] probing --- probing/README.md | 13 +++++++++++ probing/main.py | 38 +++++++++++++++++++++++++++++++ probing/probing_code.py | 47 +++++++++++++++++++++++++++++++++++++++ probing/your_workflows.py | 46 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 144 insertions(+) create mode 100644 probing/README.md create mode 100644 probing/main.py create mode 100644 probing/probing_code.py create mode 100644 probing/your_workflows.py diff --git a/probing/README.md b/probing/README.md new file mode 100644 index 00000000..daed3ee4 --- /dev/null +++ b/probing/README.md @@ -0,0 +1,13 @@ +# Probing Sample + +This can help you determine if your event loop is clogged. + +The idea is that you could add this probing workflow +to your code, and +if it doesn't log every second, then something long-running +is clogging the event loop. + +> It doesn't tell you what is clogging it, but it tells you +> whether something is clogging it. 
+ +run `poetry run python probing/main.py` diff --git a/probing/main.py b/probing/main.py new file mode 100644 index 00000000..b66fb296 --- /dev/null +++ b/probing/main.py @@ -0,0 +1,38 @@ +import asyncio + +from temporalio.client import Client +from temporalio.worker import Worker + +from probing.probing_code import ProbingWorkflow, probing_activity +from probing.your_workflows import YourWorkflow, your_activity + + + +async def main(): + client = await Client.connect("localhost:7233") + + async with Worker( + client, + task_queue="probing-task-queue", + workflows=[ProbingWorkflow, YourWorkflow], + activities=[probing_activity, your_activity], + ): + + # add this to your code + await client.start_workflow( + ProbingWorkflow.run, + id="probing", + task_queue="probing-task-queue", + ) + + while True: + # simulate running your workflows + + await client.execute_workflow( + YourWorkflow.run, + id="your-workflow", + task_queue="probing-task-queue", + ) + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/probing/probing_code.py b/probing/probing_code.py new file mode 100644 index 00000000..5fe9c680 --- /dev/null +++ b/probing/probing_code.py @@ -0,0 +1,47 @@ +""" +You could add this to your code +""" + +from datetime import timedelta + +import asyncio +import time +from temporalio import activity +from temporalio import workflow + + +@activity.defn +async def probing_activity() -> None: + """ + Here's the activity that can probe your worker and see if it's + still responsive. 
+ """ + t_prev = time.time() + while True: + wait_length = 1 + await asyncio.sleep(wait_length) + delta = time.time() - t_prev + extra_time = delta - wait_length + activity.logger.info(f"probing showed the event loop took {round(extra_time,1)} extra seconds") + print(f"probing showed the event loop took {round(extra_time,1)} extra seconds") + t_prev = time.time() + + + +@workflow.defn +class ProbingWorkflow: + """ + Here's the workflow that can probe your worker and see if it's + still responsive. + """ + + @workflow.run + async def run(self) -> str: + + return await workflow.execute_activity( + probing_activity, + + # these timeouts are going to be tricky because if the event loop + # is indeed blocked, the heartbeats etc may not behave as expected. + start_to_close_timeout=timedelta(seconds=60 * 100), + ) diff --git a/probing/your_workflows.py b/probing/your_workflows.py new file mode 100644 index 00000000..83ca8e5d --- /dev/null +++ b/probing/your_workflows.py @@ -0,0 +1,46 @@ +""" +This simulates your code. + +You can actually use your own code, but if you want +to use this code as a playground, you can change +the amount of time in the time.sleep() call in +your_activity(). +""" + +import asyncio +import time +from temporalio import activity + +from datetime import timedelta + +from temporalio import workflow + +@activity.defn +async def your_activity() -> None: + """ + Here's the activity that's in your codebase. + + You can experiment with this one to see how it behaves. + """ + + t0 = time.time() + + # this simulates a long-running activity. this is the piece that we don't + # know if your code has it or not. This is what we're probing for. 
+ time.sleep(2) + + activity.logger.info(f"your activity has finished after: {round(time.time() - t0,0)} seconds") + print(f"your activity has finished after: {round(time.time() - t0,0)} seconds") + + + +@workflow.defn +class YourWorkflow: + @workflow.run + async def run(self) -> str: + + return await workflow.execute_activity( + your_activity, + start_to_close_timeout=timedelta(seconds=60 * 100), + ) + From 125fce6a1cff4e143095bfb1a8aabcf9813b6b1a Mon Sep 17 00:00:00 2001 From: GSmithApps Date: Sat, 15 Feb 2025 11:17:49 -0600 Subject: [PATCH 14/27] renamed probing to canary --- {probing => canary}/README.md | 0 {probing => canary}/main.py | 0 {probing => canary}/probing_code.py | 0 {probing => canary}/your_workflows.py | 0 4 files changed, 0 insertions(+), 0 deletions(-) rename {probing => canary}/README.md (100%) rename {probing => canary}/main.py (100%) rename {probing => canary}/probing_code.py (100%) rename {probing => canary}/your_workflows.py (100%) diff --git a/probing/README.md b/canary/README.md similarity index 100% rename from probing/README.md rename to canary/README.md diff --git a/probing/main.py b/canary/main.py similarity index 100% rename from probing/main.py rename to canary/main.py diff --git a/probing/probing_code.py b/canary/probing_code.py similarity index 100% rename from probing/probing_code.py rename to canary/probing_code.py diff --git a/probing/your_workflows.py b/canary/your_workflows.py similarity index 100% rename from probing/your_workflows.py rename to canary/your_workflows.py From 1eab88008f57477515a77908e298fabf302a7c6b Mon Sep 17 00:00:00 2001 From: GSmithApps Date: Sat, 15 Feb 2025 11:21:29 -0600 Subject: [PATCH 15/27] canary fixes --- canary/README.md | 6 +++--- canary/{probing_code.py => canary_code.py} | 10 +++++----- canary/main.py | 18 +++++++++--------- canary/your_workflows.py | 2 +- 4 files changed, 18 insertions(+), 18 deletions(-) rename canary/{probing_code.py => canary_code.py} (76%) diff --git a/canary/README.md 
b/canary/README.md index daed3ee4..85728f1e 100644 --- a/canary/README.md +++ b/canary/README.md @@ -1,8 +1,8 @@ -# Probing Sample +# Canary Sample This can help you determine if your event loop is clogged. -The idea is that you could add this probing workflow +The idea is that you could add this canary workflow to your code, and if it doesn't log every second, then something long-running is clogging the event loop. @@ -10,4 +10,4 @@ is clogging the event loop. > It doesn't tell you what is clogging it, but it tells you > whether something is clogging it. -run `poetry run python probing/main.py` +run `poetry run python canary/main.py` diff --git a/canary/probing_code.py b/canary/canary_code.py similarity index 76% rename from canary/probing_code.py rename to canary/canary_code.py index 5fe9c680..934f1da1 100644 --- a/canary/probing_code.py +++ b/canary/canary_code.py @@ -11,7 +11,7 @@ @activity.defn -async def probing_activity() -> None: +async def canary_activity() -> None: """ Here's the activity that can probe your worker and see if it's still responsive. @@ -22,14 +22,14 @@ async def probing_activity() -> None: await asyncio.sleep(wait_length) delta = time.time() - t_prev extra_time = delta - wait_length - activity.logger.info(f"probing showed the event loop took {round(extra_time,1)} extra seconds") - print(f"probing showed the event loop took {round(extra_time,1)} extra seconds") + activity.logger.info(f"The canary showed the event loop took {round(extra_time,1)} extra seconds") + print(f"The canary showed the event loop took {round(extra_time,1)} extra seconds") t_prev = time.time() @workflow.defn -class ProbingWorkflow: +class CanaryWorkflow: """ Here's the workflow that can probe your worker and see if it's still responsive. 
@@ -39,7 +39,7 @@ class ProbingWorkflow: async def run(self) -> str: return await workflow.execute_activity( - probing_activity, + canary_activity, # these timeouts are going to be tricky because if the event loop # is indeed blocked, the heartbeats etc may not behave as expected. diff --git a/canary/main.py b/canary/main.py index b66fb296..0bdef229 100644 --- a/canary/main.py +++ b/canary/main.py @@ -3,8 +3,8 @@ from temporalio.client import Client from temporalio.worker import Worker -from probing.probing_code import ProbingWorkflow, probing_activity -from probing.your_workflows import YourWorkflow, your_activity +from canary.canary_code import CanaryWorkflow, canary_activity +from canary.your_workflows import YourWorkflow, your_activity @@ -13,16 +13,16 @@ async def main(): async with Worker( client, - task_queue="probing-task-queue", - workflows=[ProbingWorkflow, YourWorkflow], - activities=[probing_activity, your_activity], + task_queue="canary-task-queue", + workflows=[CanaryWorkflow, YourWorkflow], + activities=[canary_activity, your_activity], ): # add this to your code await client.start_workflow( - ProbingWorkflow.run, - id="probing", - task_queue="probing-task-queue", + CanaryWorkflow.run, + id="canary", + task_queue="canary-task-queue", ) while True: @@ -31,7 +31,7 @@ async def main(): await client.execute_workflow( YourWorkflow.run, id="your-workflow", - task_queue="probing-task-queue", + task_queue="canary-task-queue", ) if __name__ == "__main__": diff --git a/canary/your_workflows.py b/canary/your_workflows.py index 83ca8e5d..fe9e371d 100644 --- a/canary/your_workflows.py +++ b/canary/your_workflows.py @@ -26,7 +26,7 @@ async def your_activity() -> None: t0 = time.time() # this simulates a long-running activity. this is the piece that we don't - # know if your code has it or not. This is what we're probing for. + # know if your code has it or not. This is what we're using the canary for. 
time.sleep(2) activity.logger.info(f"your activity has finished after: {round(time.time() - t0,0)} seconds") From f7ef9f95ada33f866e7f0308495bf7e6abf54e44 Mon Sep 17 00:00:00 2001 From: GSmithApps Date: Sat, 15 Feb 2025 11:49:23 -0600 Subject: [PATCH 16/27] small canary improvements --- canary/canary_code.py | 12 +++++++----- canary/your_workflows.py | 12 +++++++++--- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/canary/canary_code.py b/canary/canary_code.py index 934f1da1..08392a63 100644 --- a/canary/canary_code.py +++ b/canary/canary_code.py @@ -10,6 +10,9 @@ from temporalio import workflow +_CANARY_WAIT_TIME = 3 + + @activity.defn async def canary_activity() -> None: """ @@ -18,12 +21,11 @@ async def canary_activity() -> None: """ t_prev = time.time() while True: - wait_length = 1 - await asyncio.sleep(wait_length) + await asyncio.sleep(_CANARY_WAIT_TIME) delta = time.time() - t_prev - extra_time = delta - wait_length - activity.logger.info(f"The canary showed the event loop took {round(extra_time,1)} extra seconds") - print(f"The canary showed the event loop took {round(extra_time,1)} extra seconds") + extra_time = delta - _CANARY_WAIT_TIME + activity.logger.info(f"The canary showed the event loop took {round(extra_time,1)} seconds to get back after its await finished") + print(f"The canary showed the event loop took {round(extra_time,1)} seconds to get back after its await finished") t_prev = time.time() diff --git a/canary/your_workflows.py b/canary/your_workflows.py index fe9e371d..5d6d4930 100644 --- a/canary/your_workflows.py +++ b/canary/your_workflows.py @@ -9,6 +9,7 @@ import asyncio import time +import random from temporalio import activity from datetime import timedelta @@ -27,10 +28,15 @@ async def your_activity() -> None: # this simulates a long-running activity. this is the piece that we don't # know if your code has it or not. This is what we're using the canary for. 
- time.sleep(2) + r = random.random() + time.sleep(.5 + r) - activity.logger.info(f"your activity has finished after: {round(time.time() - t0,0)} seconds") - print(f"your activity has finished after: {round(time.time() - t0,0)} seconds") + # if you replace the time.sleep() with an asyncio.sleep(), + # the canary will detect no blocking. + # asyncio.sleep(.5 + r) + + activity.logger.info(f"your activity has finished after: {round(time.time() - t0,1)} seconds") + print(f"your activity has finished after: {round(time.time() - t0,1)} seconds") From 1d0f59a75652a110a7a7083d6bd18ec9f5b095b7 Mon Sep 17 00:00:00 2001 From: GSmithApps Date: Sat, 15 Feb 2025 11:51:55 -0600 Subject: [PATCH 17/27] small canary note --- canary/canary_code.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/canary/canary_code.py b/canary/canary_code.py index 08392a63..9f396c35 100644 --- a/canary/canary_code.py +++ b/canary/canary_code.py @@ -24,6 +24,10 @@ async def canary_activity() -> None: await asyncio.sleep(_CANARY_WAIT_TIME) delta = time.time() - t_prev extra_time = delta - _CANARY_WAIT_TIME + + # Log the extra time taken by the event loop to get back after the await + # If you want, you can turn this into a histogram and show the distribution. + # maybe you could even put it in your metrics. 
activity.logger.info(f"The canary showed the event loop took {round(extra_time,1)} seconds to get back after its await finished") print(f"The canary showed the event loop took {round(extra_time,1)} seconds to get back after its await finished") t_prev = time.time() From 76bca56c04efc4f62102fd1c473598498ac2c7c2 Mon Sep 17 00:00:00 2001 From: GSmithApps Date: Sat, 15 Feb 2025 12:23:58 -0600 Subject: [PATCH 18/27] updated canary readme --- canary/README.md | 30 ++++++++++++++++++++++++++++-- canary/your_workflows.py | 3 +-- 2 files changed, 29 insertions(+), 4 deletions(-) diff --git a/canary/README.md b/canary/README.md index 85728f1e..bd8fb380 100644 --- a/canary/README.md +++ b/canary/README.md @@ -7,7 +7,33 @@ to your code, and if it doesn't log every second, then something long-running is clogging the event loop. -> It doesn't tell you what is clogging it, but it tells you +> Note: it doesn't tell you what is clogging it, but it tells you > whether something is clogging it. -run `poetry run python canary/main.py` +## Example + +```txt +$ run `poetry run python canary/main.py` + +Your activity finished after 1.4 seconds +Your activity finished after 1.3 seconds +Your activity finished after 0.7 seconds +The canary showed the event loop took 0.4 seconds to get back after its await finished +Your activity finished after 1.0 seconds +Your activity finished after 0.6 seconds +Your activity finished after 0.7 seconds +Your activity finished after 1.3 seconds +The canary showed the event loop took 0.7 seconds to get back after its await finished +Your activity finished after 0.9 seconds +Your activity finished after 1.1 seconds +Your activity finished after 1.2 seconds +The canary showed the event loop took 0.3 seconds to get back after its await finished +Your activity finished after 1.2 seconds +Your activity finished after 1.4 seconds +Your activity finished after 0.5 seconds +The canary showed the event loop took 0.2 seconds to get back after its await finished 
+Your activity finished after 1.3 seconds +Your activity finished after 1.4 seconds +Your activity finished after 1.0 seconds +The canary showed the event loop took 0.9 seconds to get back after its await finished +``` diff --git a/canary/your_workflows.py b/canary/your_workflows.py index 5d6d4930..890cc959 100644 --- a/canary/your_workflows.py +++ b/canary/your_workflows.py @@ -35,8 +35,7 @@ async def your_activity() -> None: # the canary will detect no blocking. # asyncio.sleep(.5 + r) - activity.logger.info(f"your activity has finished after: {round(time.time() - t0,1)} seconds") - print(f"your activity has finished after: {round(time.time() - t0,1)} seconds") + print(f"Your activity finished after {round(time.time() - t0,1)} seconds") From cef42405b8d1eb48310b4cd388b5e9ff67a4699b Mon Sep 17 00:00:00 2001 From: GSmithApps Date: Sat, 15 Feb 2025 14:52:20 -0600 Subject: [PATCH 19/27] canary looking good --- canary/README.md | 54 +++++++++++++++++++------------ canary/canary_code.py | 11 +++---- canary/{main.py => run_worker.py} | 11 +------ canary/run_your_workflows.py | 24 ++++++++++++++ 4 files changed, 64 insertions(+), 36 deletions(-) rename canary/{main.py => run_worker.py} (70%) create mode 100644 canary/run_your_workflows.py diff --git a/canary/README.md b/canary/README.md index bd8fb380..1bafcff7 100644 --- a/canary/README.md +++ b/canary/README.md @@ -1,39 +1,53 @@ -# Canary Sample +# Event Loop Canary Sample This can help you determine if your event loop is clogged. The idea is that you could add this canary workflow -to your code, and -if it doesn't log every second, then something long-running -is clogging the event loop. +to your worker initialization, and +it will log the delays in your event loop. > Note: it doesn't tell you what is clogging it, but it tells you > whether something is clogging it. 
## Example +In one terminal, run: + ```txt -$ run `poetry run python canary/main.py` +$ poetry run python canary/run_your_workflows.py -Your activity finished after 1.4 seconds +# no output +``` + +And in another, run the following: + +```txt +$ poetry run python canary/run_worker.py + +Your activity finished after 0.5 seconds Your activity finished after 1.3 seconds -Your activity finished after 0.7 seconds -The canary showed the event loop took 0.4 seconds to get back after its await finished -Your activity finished after 1.0 seconds -Your activity finished after 0.6 seconds -Your activity finished after 0.7 seconds Your activity finished after 1.3 seconds -The canary showed the event loop took 0.7 seconds to get back after its await finished -Your activity finished after 0.9 seconds +The canary detected 1.177 seconds of event loop delay. +Your activity finished after 1.4 seconds Your activity finished after 1.1 seconds Your activity finished after 1.2 seconds -The canary showed the event loop took 0.3 seconds to get back after its await finished -Your activity finished after 1.2 seconds -Your activity finished after 1.4 seconds -Your activity finished after 0.5 seconds -The canary showed the event loop took 0.2 seconds to get back after its await finished +The canary detected 0.766 seconds of event loop delay. +Your activity finished after 1.3 seconds +Your activity finished after 0.8 seconds +Your activity finished after 1.3 seconds +The canary detected 0.472 seconds of event loop delay. +Your activity finished after 0.9 seconds +Your activity finished after 1.3 seconds Your activity finished after 1.3 seconds +The canary detected 0.603 seconds of event loop delay. 
Your activity finished after 1.4 seconds -Your activity finished after 1.0 seconds -The canary showed the event loop took 0.9 seconds to get back after its await finished +Your activity finished after 1.4 seconds +Your activity finished after 0.7 seconds +The canary detected 0.542 seconds of event loop delay. +Your activity finished after 1.2 seconds +Your activity finished after 0.7 seconds +Your activity finished after 0.7 seconds +Your activity finished after 0.9 seconds +The canary detected 0.658 seconds of event loop delay. +... ``` diff --git a/canary/canary_code.py b/canary/canary_code.py index 9f396c35..8eb0650d 100644 --- a/canary/canary_code.py +++ b/canary/canary_code.py @@ -22,16 +22,15 @@ async def canary_activity() -> None: t_prev = time.time() while True: await asyncio.sleep(_CANARY_WAIT_TIME) - delta = time.time() - t_prev - extra_time = delta - _CANARY_WAIT_TIME + t_new = time.time() + delay = t_new - (t_prev + _CANARY_WAIT_TIME) + t_prev = t_new # Log the extra time taken by the event loop to get back after the await # If you want, you can turn this into a histogram and show the distribution. # maybe you could even put it in your metrics. 
- activity.logger.info(f"The canary showed the event loop took {round(extra_time,1)} seconds to get back after its await finished") - print(f"The canary showed the event loop took {round(extra_time,1)} seconds to get back after its await finished") - t_prev = time.time() - + activity.logger.info(f"The canary detected {round(delay,3)} seconds of event loop delay.") + print(f"The canary detected {round(delay,3)} seconds of event loop delay.") @workflow.defn diff --git a/canary/main.py b/canary/run_worker.py similarity index 70% rename from canary/main.py rename to canary/run_worker.py index 0bdef229..003fdbe9 100644 --- a/canary/main.py +++ b/canary/run_worker.py @@ -19,20 +19,11 @@ async def main(): ): # add this to your code - await client.start_workflow( + await client.execute_workflow( CanaryWorkflow.run, id="canary", task_queue="canary-task-queue", ) - while True: - # simulate running your workflows - - await client.execute_workflow( - YourWorkflow.run, - id="your-workflow", - task_queue="canary-task-queue", - ) - if __name__ == "__main__": asyncio.run(main()) diff --git a/canary/run_your_workflows.py b/canary/run_your_workflows.py new file mode 100644 index 00000000..072f8303 --- /dev/null +++ b/canary/run_your_workflows.py @@ -0,0 +1,24 @@ +import asyncio + +from temporalio.client import Client +from temporalio.worker import Worker + +from canary.canary_code import CanaryWorkflow, canary_activity +from canary.your_workflows import YourWorkflow, your_activity + + + +async def main(): + client = await Client.connect("localhost:7233") + + while True: + # simulate running your workflows + + await client.execute_workflow( + YourWorkflow.run, + id="your-workflow", + task_queue="canary-task-queue", + ) + +if __name__ == "__main__": + asyncio.run(main()) From d61611e5ee4c450129448a94494824ee577a741d Mon Sep 17 00:00:00 2001 From: GSmithApps Date: Sat, 15 Feb 2025 14:55:07 -0600 Subject: [PATCH 20/27] clean canary --- canary/run_worker.py | 4 ++++ 
canary/run_your_workflows.py | 5 +---- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/canary/run_worker.py b/canary/run_worker.py index 003fdbe9..e211fdf3 100644 --- a/canary/run_worker.py +++ b/canary/run_worker.py @@ -1,3 +1,7 @@ +""" +In your worker initialization, you can add the canary workflow. +""" + import asyncio from temporalio.client import Client diff --git a/canary/run_your_workflows.py b/canary/run_your_workflows.py index 072f8303..ac56d71c 100644 --- a/canary/run_your_workflows.py +++ b/canary/run_your_workflows.py @@ -1,11 +1,8 @@ import asyncio from temporalio.client import Client -from temporalio.worker import Worker - -from canary.canary_code import CanaryWorkflow, canary_activity -from canary.your_workflows import YourWorkflow, your_activity +from canary.your_workflows import YourWorkflow async def main(): From 52d9e3dd93dc25688438e0092bbf65f710db157c Mon Sep 17 00:00:00 2001 From: GSmithApps Date: Sat, 15 Feb 2025 15:01:11 -0600 Subject: [PATCH 21/27] format canary --- canary/canary_code.py | 52 ------------------------------------ canary/run_worker.py | 46 ++++++++++++++++++++++++++++++- canary/run_your_workflows.py | 1 + canary/your_workflows.py | 5 ++-- 4 files changed, 48 insertions(+), 56 deletions(-) delete mode 100644 canary/canary_code.py diff --git a/canary/canary_code.py b/canary/canary_code.py deleted file mode 100644 index 8eb0650d..00000000 --- a/canary/canary_code.py +++ /dev/null @@ -1,52 +0,0 @@ -""" -You could add this to your code -""" - -from datetime import timedelta - -import asyncio -import time -from temporalio import activity -from temporalio import workflow - - -_CANARY_WAIT_TIME = 3 - - -@activity.defn -async def canary_activity() -> None: - """ - Here's the activity that can probe your worker and see if it's - still responsive. 
- """ - t_prev = time.time() - while True: - await asyncio.sleep(_CANARY_WAIT_TIME) - t_new = time.time() - delay = t_new - (t_prev + _CANARY_WAIT_TIME) - t_prev = t_new - - # Log the extra time taken by the event loop to get back after the await - # If you want, you can turn this into a histogram and show the distribution. - # maybe you could even put it in your metrics. - activity.logger.info(f"The canary detected {round(delay,3)} seconds of event loop delay.") - print(f"The canary detected {round(delay,3)} seconds of event loop delay.") - - -@workflow.defn -class CanaryWorkflow: - """ - Here's the workflow that can probe your worker and see if it's - still responsive. - """ - - @workflow.run - async def run(self) -> str: - - return await workflow.execute_activity( - canary_activity, - - # these timeouts are going to be tricky because if the event loop - # is indeed blocked, the heartbeats etc may not behave as expected. - start_to_close_timeout=timedelta(seconds=60 * 100), - ) diff --git a/canary/run_worker.py b/canary/run_worker.py index e211fdf3..d870b096 100644 --- a/canary/run_worker.py +++ b/canary/run_worker.py @@ -2,14 +2,57 @@ In your worker initialization, you can add the canary workflow. """ +from datetime import timedelta import asyncio +import time +from temporalio import activity, workflow from temporalio.client import Client from temporalio.worker import Worker -from canary.canary_code import CanaryWorkflow, canary_activity from canary.your_workflows import YourWorkflow, your_activity +_CANARY_CHECK_RATE = 3 + + +@activity.defn +async def canary_activity() -> None: + """ + Here's the activity that can probe your worker and see if it's + still responsive. 
+ """ + t_prev = time.time() + while True: + await asyncio.sleep(_CANARY_CHECK_RATE) + t_new = time.time() + delay = t_new - (t_prev + _CANARY_CHECK_RATE) + t_prev = t_new + + # Log the extra time taken by the event loop to get back after the await + # If you want, you can turn this into a histogram and show the distribution. + # maybe you could even put it in your metrics. + activity.logger.info( + f"The canary detected {round(delay,3)} seconds of event loop delay." + ) + print(f"The canary detected {round(delay,3)} seconds of event loop delay.") + + +@workflow.defn +class CanaryWorkflow: + """ + Here's the workflow that can probe your worker and see if it's + still responsive. + """ + + @workflow.run + async def run(self) -> str: + + return await workflow.execute_activity( + canary_activity, + # these timeouts are going to be tricky because if the event loop + # is indeed blocked, the heartbeats etc may not behave as expected. + start_to_close_timeout=timedelta(seconds=60 * 100), + ) async def main(): @@ -29,5 +72,6 @@ async def main(): task_queue="canary-task-queue", ) + if __name__ == "__main__": asyncio.run(main()) diff --git a/canary/run_your_workflows.py b/canary/run_your_workflows.py index ac56d71c..ecc204b5 100644 --- a/canary/run_your_workflows.py +++ b/canary/run_your_workflows.py @@ -17,5 +17,6 @@ async def main(): task_queue="canary-task-queue", ) + if __name__ == "__main__": asyncio.run(main()) diff --git a/canary/your_workflows.py b/canary/your_workflows.py index 890cc959..fc5b9542 100644 --- a/canary/your_workflows.py +++ b/canary/your_workflows.py @@ -16,6 +16,7 @@ from temporalio import workflow + @activity.defn async def your_activity() -> None: """ @@ -29,7 +30,7 @@ async def your_activity() -> None: # this simulates a long-running activity. this is the piece that we don't # know if your code has it or not. This is what we're using the canary for. 
r = random.random() - time.sleep(.5 + r) + time.sleep(0.5 + r) # if you replace the time.sleep() with an asyncio.sleep(), # the canary will detect no blocking. @@ -38,7 +39,6 @@ async def your_activity() -> None: print(f"Your activity finished after {round(time.time() - t0,1)} seconds") - @workflow.defn class YourWorkflow: @workflow.run @@ -48,4 +48,3 @@ async def run(self) -> str: your_activity, start_to_close_timeout=timedelta(seconds=60 * 100), ) - From 8fb8c35a96893e26f52d65417d9b80abce3beb00 Mon Sep 17 00:00:00 2001 From: GSmithApps Date: Sat, 15 Feb 2025 15:04:28 -0600 Subject: [PATCH 22/27] canary --- canary/run_your_workflows.py | 46 ++++++++++++++++++++++++++++++++- canary/your_workflows.py | 50 ------------------------------------ 2 files changed, 45 insertions(+), 51 deletions(-) delete mode 100644 canary/your_workflows.py diff --git a/canary/run_your_workflows.py b/canary/run_your_workflows.py index ecc204b5..68b8571c 100644 --- a/canary/run_your_workflows.py +++ b/canary/run_your_workflows.py @@ -1,8 +1,52 @@ +""" +This simulates your code. + +You can actually use your own code, but if you want +to use this code as a playground, you can change +the amount of time in the time.sleep() call in +your_activity(). +""" + import asyncio +import time +import random +from datetime import timedelta from temporalio.client import Client +from temporalio import activity, workflow + + +@activity.defn +async def your_activity() -> None: + """ + Here's the activity that's in your codebase. + + You can experiment with this one to see how it behaves. + """ + + t0 = time.time() -from canary.your_workflows import YourWorkflow + # this simulates a long-running activity. this is the piece that we don't + # know if your code has it or not. This is what we're using the canary for. + r = random.random() + time.sleep(0.5 + r) + + # if you replace the time.sleep() with an asyncio.sleep(), + # the canary will detect no blocking and show no delay. 
+ # asyncio.sleep(.5 + r) + + print(f"Your activity finished after {round(time.time() - t0,1)} seconds") + + +@workflow.defn +class YourWorkflow: + @workflow.run + async def run(self) -> str: + + return await workflow.execute_activity( + your_activity, + start_to_close_timeout=timedelta(seconds=60 * 100), + ) async def main(): diff --git a/canary/your_workflows.py b/canary/your_workflows.py deleted file mode 100644 index fc5b9542..00000000 --- a/canary/your_workflows.py +++ /dev/null @@ -1,50 +0,0 @@ -""" -This simulates your code. - -You can actually use your own code, but if you want -to use this code as a playground, you can change -the amount of time in the time.sleep() call in -your_activity(). -""" - -import asyncio -import time -import random -from temporalio import activity - -from datetime import timedelta - -from temporalio import workflow - - -@activity.defn -async def your_activity() -> None: - """ - Here's the activity that's in your codebase. - - You can experiment with this one to see how it behaves. - """ - - t0 = time.time() - - # this simulates a long-running activity. this is the piece that we don't - # know if your code has it or not. This is what we're using the canary for. - r = random.random() - time.sleep(0.5 + r) - - # if you replace the time.sleep() with an asyncio.sleep(), - # the canary will detect no blocking. 
- # asyncio.sleep(.5 + r) - - print(f"Your activity finished after {round(time.time() - t0,1)} seconds") - - -@workflow.defn -class YourWorkflow: - @workflow.run - async def run(self) -> str: - - return await workflow.execute_activity( - your_activity, - start_to_close_timeout=timedelta(seconds=60 * 100), - ) From 7eef45d307cced78d739ea324e40bb7f4a9517fe Mon Sep 17 00:00:00 2001 From: GSmithApps Date: Sat, 15 Feb 2025 15:13:21 -0600 Subject: [PATCH 23/27] canary looking good --- canary/README.md | 14 +++++++------- canary/run_worker.py | 4 ++-- .../{run_your_workflows.py => your_workflows.py} | 9 +++++---- 3 files changed, 14 insertions(+), 13 deletions(-) rename canary/{run_your_workflows.py => your_workflows.py} (87%) diff --git a/canary/README.md b/canary/README.md index 1bafcff7..f0af6092 100644 --- a/canary/README.md +++ b/canary/README.md @@ -14,7 +14,7 @@ it will log the delays in your event loop. In one terminal, run: ```txt -$ poetry run python canary/run_your_workflows.py +$ poetry run python canary/your_workflows.py # no output ``` @@ -27,27 +27,27 @@ $ poetry run python canary/run_worker.py Your activity finished after 0.5 seconds Your activity finished after 1.3 seconds Your activity finished after 1.3 seconds -The canary detected 1.177 seconds of event loop delay. +The canary detected 1.1774 seconds of event loop delay. Your activity finished after 1.4 seconds Your activity finished after 1.1 seconds Your activity finished after 1.2 seconds -The canary detected 0.766 seconds of event loop delay. +The canary detected 0.7662 seconds of event loop delay. Your activity finished after 1.3 seconds Your activity finished after 0.8 seconds Your activity finished after 1.3 seconds -The canary detected 0.472 seconds of event loop delay. +The canary detected 0.4724 seconds of event loop delay. 
Your activity finished after 0.9 seconds Your activity finished after 1.3 seconds Your activity finished after 1.3 seconds -The canary detected 0.603 seconds of event loop delay. +The canary detected 0.6033 seconds of event loop delay. Your activity finished after 1.4 seconds Your activity finished after 1.4 seconds Your activity finished after 0.7 seconds -The canary detected 0.542 seconds of event loop delay. +The canary detected 0.5424 seconds of event loop delay. Your activity finished after 1.2 seconds Your activity finished after 0.7 seconds Your activity finished after 0.7 seconds Your activity finished after 0.9 seconds -The canary detected 0.658 seconds of event loop delay. +The canary detected 0.6584 seconds of event loop delay. ... ``` diff --git a/canary/run_worker.py b/canary/run_worker.py index d870b096..bfa150ec 100644 --- a/canary/run_worker.py +++ b/canary/run_worker.py @@ -32,9 +32,9 @@ async def canary_activity() -> None: # If you want, you can turn this into a histogram and show the distribution. # maybe you could even put it in your metrics. activity.logger.info( - f"The canary detected {round(delay,3)} seconds of event loop delay." + f"The canary detected {round(delay,4)} seconds of event loop delay." ) - print(f"The canary detected {round(delay,3)} seconds of event loop delay.") + print(f"The canary detected {round(delay,4)} seconds of event loop delay.") @workflow.defn diff --git a/canary/run_your_workflows.py b/canary/your_workflows.py similarity index 87% rename from canary/run_your_workflows.py rename to canary/your_workflows.py index 68b8571c..94587cc0 100644 --- a/canary/run_your_workflows.py +++ b/canary/your_workflows.py @@ -28,12 +28,13 @@ async def your_activity() -> None: # this simulates a long-running activity. this is the piece that we don't # know if your code has it or not. This is what we're using the canary for. + # + # to illustrate the difference, comment out time.sleep() and uncomment + # the asyncio.sleep() call. 
+ # the canary will detect very little delay. r = random.random() time.sleep(0.5 + r) - - # if you replace the time.sleep() with an asyncio.sleep(), - # the canary will detect no blocking and show no delay. - # asyncio.sleep(.5 + r) + # await asyncio.sleep(.5 + r) print(f"Your activity finished after {round(time.time() - t0,1)} seconds") From b16977df0c75f4106024606a0990455840e8f3db Mon Sep 17 00:00:00 2001 From: GSmithApps Date: Sat, 15 Feb 2025 15:16:57 -0600 Subject: [PATCH 24/27] rename canary variable --- canary/run_worker.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/canary/run_worker.py b/canary/run_worker.py index bfa150ec..19b02276 100644 --- a/canary/run_worker.py +++ b/canary/run_worker.py @@ -12,7 +12,7 @@ from canary.your_workflows import YourWorkflow, your_activity -_CANARY_CHECK_RATE = 3 +_SECONDS_BETWEEN_CANARY_CHECKS = 3 @activity.defn @@ -23,9 +23,9 @@ async def canary_activity() -> None: """ t_prev = time.time() while True: - await asyncio.sleep(_CANARY_CHECK_RATE) + await asyncio.sleep(_SECONDS_BETWEEN_CANARY_CHECKS) t_new = time.time() - delay = t_new - (t_prev + _CANARY_CHECK_RATE) + delay = t_new - (t_prev + _SECONDS_BETWEEN_CANARY_CHECKS) t_prev = t_new # Log the extra time taken by the event loop to get back after the await From 4a20e2c544be6f4bb417d81fd75d7e7f4546bb9f Mon Sep 17 00:00:00 2001 From: GSmithApps Date: Sat, 15 Feb 2025 15:18:54 -0600 Subject: [PATCH 25/27] canary good --- canary/README.md | 2 +- canary/{run_worker.py => run_canary_worker.py} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename canary/{run_worker.py => run_canary_worker.py} (100%) diff --git a/canary/README.md b/canary/README.md index f0af6092..d3e4064f 100644 --- a/canary/README.md +++ b/canary/README.md @@ -22,7 +22,7 @@ $ poetry run python canary/your_workflows.py And in another, run the following: ```txt -$ poetry run python canary/run_worker.py +$ poetry run python canary/run_canary_worker.py Your activity finished after 
0.5 seconds Your activity finished after 1.3 seconds diff --git a/canary/run_worker.py b/canary/run_canary_worker.py similarity index 100% rename from canary/run_worker.py rename to canary/run_canary_worker.py From ec57990b035e778f5bd448dcfbdf574a7dff94c7 Mon Sep 17 00:00:00 2001 From: GSmithApps Date: Fri, 21 Feb 2025 15:34:23 -0600 Subject: [PATCH 26/27] added reversal coordinator --- reversal_coordinator/reversal_coordinator.py | 31 +++++++++++ reversal_coordinator/reversal_workflow.py | 56 ++++++++++++++++++++ 2 files changed, 87 insertions(+) create mode 100644 reversal_coordinator/reversal_coordinator.py create mode 100644 reversal_coordinator/reversal_workflow.py diff --git a/reversal_coordinator/reversal_coordinator.py b/reversal_coordinator/reversal_coordinator.py new file mode 100644 index 00000000..05f6ad3e --- /dev/null +++ b/reversal_coordinator/reversal_coordinator.py @@ -0,0 +1,31 @@ +from temporalio import workflow + + +@workflow.defn +class ReversalCoordinator: + + def __init__(self, list_of_ids: list): + + self.list_of_ids = list_of_ids + self.exit = False + + @workflow.run + async def run(self): + + await condition(self.exit) + + @workflow.signal + def registerworkflowid(self, workflow_id: str): + self.list_of_ids.append(workflow_id) + + if continue_as_new_recommended + continue_as_new(self.list_of_ids) + + + @workflow.signal + def decision_made(self, decision: bool): + for id in self.list_of_ids: + handle = get_workflow_handle(id) + handle.send_signal("decision_made", decision) + + self.exit = True diff --git a/reversal_coordinator/reversal_workflow.py b/reversal_coordinator/reversal_workflow.py new file mode 100644 index 00000000..aee34f69 --- /dev/null +++ b/reversal_coordinator/reversal_workflow.py @@ -0,0 +1,56 @@ +from temporalio import workflow, activity + + +@workflow.defn +class ReversalWorkflow: + + def __init__(self, merchant): + self.merchant = merchant + self.decision_received = False + self.decision_should_continue_reversal = False 
+ + @workflow.run + async def run(self): + + if rate_limited(): + reversal_coordinator_handle = get_or_make_reversal_coordinator() + + register_with_handler(reversal_coordinator_handle) + + await condition(self.decision_received) + + if self.decision_should_continue_reversal: + # continue reversal + pass + else: + # complete + pass + + @activity.defn + def get_or_make_reversal_coordinator(self): + + while True: + try: + workflow_handle = get_workflow_handle("reversal-coordinator-" + self.merchant) + return workflow_handle + except WorkflowExecutionNotFound: + try: + workflow_handle = start_child_workflow( + ReversalCoordinator.run, + arg=[], + id="reversal-coordinator-" + self.merchant, + task_queue="reversal-coordinator-task-queue", + parent_close_policy='abandon' + ) + return workflow_handle + except WorkflowExecutionAlreadyStarted: + continue + + @activity.defn + def register_with_handler(self, handle): + handle.signal("registerworkflowid", workflow.id) + + @workflow.signal + def decision_made(self, decision_result: bool): + self.decision_received = True + self.decision_should_continue_reversal = decision_result From 7723e127492ca4df2f7925d896d5f9295d2735bf Mon Sep 17 00:00:00 2001 From: GSmithApps Date: Fri, 21 Feb 2025 16:04:22 -0600 Subject: [PATCH 27/27] remove reversal coordinator --- reversal_coordinator/reversal_coordinator.py | 31 ----------- reversal_coordinator/reversal_workflow.py | 56 -------------------- 2 files changed, 87 deletions(-) delete mode 100644 reversal_coordinator/reversal_coordinator.py delete mode 100644 reversal_coordinator/reversal_workflow.py diff --git a/reversal_coordinator/reversal_coordinator.py b/reversal_coordinator/reversal_coordinator.py deleted file mode 100644 index 05f6ad3e..00000000 --- a/reversal_coordinator/reversal_coordinator.py +++ /dev/null @@ -1,31 +0,0 @@ -from temporalio import workflow - - -@workflow.defn -class ReversalCoordinator: - - def __init__(self, list_of_ids: list): - - self.list_of_ids = list_of_ids - 
self.exit = False - - @workflow.run - async def run(self): - - await condition(self.exit) - - @workflow.signal - def registerworkflowid(self, workflow_id: str): - self.list_of_ids.append(workflow_id) - - if continue_as_new_recommended - continue_as_new(self.list_of_ids) - - - @workflow.signal - def decision_made(self, decision: bool): - for id in self.list_of_ids: - handle = get_workflow_handle(id) - handle.send_signal("decision_made", decision) - - self.exit = True diff --git a/reversal_coordinator/reversal_workflow.py b/reversal_coordinator/reversal_workflow.py deleted file mode 100644 index aee34f69..00000000 --- a/reversal_coordinator/reversal_workflow.py +++ /dev/null @@ -1,56 +0,0 @@ -from temporalio import workflow, activity - - -@workflow.defn -class ReversalWorkflow: - - def __init__(self, merchant): - self.merchant = merchant - self.decision_received = False - self.decision_should_continue_reversal = False - - @workflow.run - async def run(self): - - if rate_limited(): - reversal_coordinator_handle = get_or_make_reversal_coordinator() - - register_with_handler(reversal_coordinator_handle) - - await condition(self.decision_received) - - if self.decision_should_continue_reversal: - # continue reversal - pass - else: - # complete - pass - - @activity.defn - def get_or_make_reversal_coordinator(self): - - while True: - try: - workflow_handle = get_workflow_handle("reversal-coordinator-" + self.merchant) - return workflow_handle - except WorkflowExecutionNotFound: - try: - workflow_handle = start_child_workflow( - ReversalCoordinator.run, - arg=[], - id="reversal-coordinator-" + self.merchant, - task_queue="reversal-coordinator-task-queue", - parent_close_policy='abandon' - ) - return workflow_handle - except WorkflowExecutionAlreadyStarted: - continue - - @activity.defn - def register_with_handler(self, handle): - handle.signal("registerworkflowid", workflow.id) - - @workflow.signal - def decision_made(self, decision_result: bool): - self.decision_received = 
True - self.decision_should_continue_reversal = decision_result