diff --git a/canary/README.md b/canary/README.md new file mode 100644 index 00000000..d3e4064f --- /dev/null +++ b/canary/README.md @@ -0,0 +1,53 @@ +# Event Loop Canary Sample + +This can help you determine if your event loop is clogged. + +The idea is that you could add this canary workflow +to your worker initialization, and +it will log the delays in your event loop. + +> Note: it doesn't tell you what is clogging it, but it tells you +> whether something is clogging it. + +## Example + +In one terminal, run: + +```txt +$ poetry run python canary/your_workflows.py + +# no output +``` + +And in another, run the following: + +```txt +$ poetry run python canary/run_canary_worker.py + +Your activity finished after 0.5 seconds +Your activity finished after 1.3 seconds +Your activity finished after 1.3 seconds +The canary detected 1.1774 seconds of event loop delay. +Your activity finished after 1.4 seconds +Your activity finished after 1.1 seconds +Your activity finished after 1.2 seconds +The canary detected 0.7662 seconds of event loop delay. +Your activity finished after 1.3 seconds +Your activity finished after 0.8 seconds +Your activity finished after 1.3 seconds +The canary detected 0.4724 seconds of event loop delay. +Your activity finished after 0.9 seconds +Your activity finished after 1.3 seconds +Your activity finished after 1.3 seconds +The canary detected 0.6033 seconds of event loop delay. +Your activity finished after 1.4 seconds +Your activity finished after 1.4 seconds +Your activity finished after 0.7 seconds +The canary detected 0.5424 seconds of event loop delay. +Your activity finished after 1.2 seconds +Your activity finished after 0.7 seconds +Your activity finished after 0.7 seconds +Your activity finished after 0.9 seconds +The canary detected 0.6584 seconds of event loop delay. +... 
+```
diff --git a/canary/run_canary_worker.py b/canary/run_canary_worker.py
new file mode 100644
index 00000000..19b02276
--- /dev/null
+++ b/canary/run_canary_worker.py
@@ -0,0 +1,77 @@
+"""
+In your worker initialization, you can add the canary workflow.
+"""
+
+from datetime import timedelta
+import asyncio
+import time
+
+from temporalio import activity, workflow
+from temporalio.client import Client
+from temporalio.worker import Worker
+
+from canary.your_workflows import YourWorkflow, your_activity
+
+_SECONDS_BETWEEN_CANARY_CHECKS = 3
+
+
+@activity.defn
+async def canary_activity() -> None:
+    """
+    Probe the worker forever: sleep a fixed interval on the event loop
+    and report how much later than requested the loop resumed us.
+    """
+    t_prev = time.monotonic()
+    while True:
+        await asyncio.sleep(_SECONDS_BETWEEN_CANARY_CHECKS)
+        t_new = time.monotonic()
+        delay = t_new - (t_prev + _SECONDS_BETWEEN_CANARY_CHECKS)
+        t_prev = t_new
+
+        # time.monotonic() is used above because wall-clock adjustments
+        # (e.g. NTP) would otherwise be reported as event loop delay.
+        # The delay is logged and printed here; it could also feed a
+        # histogram or your metrics.
+        message = f"The canary detected {round(delay,4)} seconds of event loop delay."
+        activity.logger.info(message)
+        print(message)
+
+
+@workflow.defn
+class CanaryWorkflow:
+    """
+    Workflow wrapper around canary_activity, so the probe can be
+    registered on a worker next to your own workflows and activities.
+    """
+
+    @workflow.run
+    async def run(self) -> None:
+        # canary_activity loops forever, so it never returns normally.
+        await workflow.execute_activity(
+            canary_activity,
+            # these timeouts are going to be tricky because if the event loop
+            # is indeed blocked, the heartbeats etc may not behave as expected.
+            start_to_close_timeout=timedelta(seconds=60 * 100),
+        )
+
+
+async def main() -> None:
+    client = await Client.connect("localhost:7233")
+
+    async with Worker(
+        client,
+        task_queue="canary-task-queue",
+        workflows=[CanaryWorkflow, YourWorkflow],
+        activities=[canary_activity, your_activity],
+    ):
+
+        # add this to your code
+        await client.execute_workflow(
+            CanaryWorkflow.run,
+            id="canary",
+            task_queue="canary-task-queue",
+        )
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
diff --git a/canary/your_workflows.py b/canary/your_workflows.py
new file mode 100644
index 00000000..94587cc0
--- /dev/null
+++ b/canary/your_workflows.py
@@ -0,0 +1,67 @@
+"""
+This simulates your code.
+
+You can actually use your own code, but if you want
+to use this code as a playground, you can change
+the amount of time in the time.sleep() call in
+your_activity().
+"""
+
+import asyncio
+import time
+import random
+from datetime import timedelta
+
+from temporalio.client import Client
+from temporalio import activity, workflow
+
+
+@activity.defn
+async def your_activity() -> None:
+    """
+    Here's the activity that's in your codebase.
+
+    You can experiment with this one to see how it behaves.
+    """
+
+    t0 = time.monotonic()
+
+    # This simulates a long-running, blocking call. We don't know whether
+    # your code has one; that is what the canary is meant to find out.
+    #
+    # To illustrate the difference, comment out time.sleep() and uncomment
+    # the asyncio.sleep() call.
+    # The canary will then detect very little delay.
+    r = random.random()
+    time.sleep(0.5 + r)
+    # await asyncio.sleep(.5 + r)
+
+    print(f"Your activity finished after {round(time.monotonic() - t0,1)} seconds")
+
+
+@workflow.defn
+class YourWorkflow:
+    @workflow.run
+    async def run(self) -> None:
+        # your_activity returns nothing, so neither does the workflow.
+        await workflow.execute_activity(
+            your_activity,
+            start_to_close_timeout=timedelta(seconds=60 * 100),
+        )
+
+
+async def main() -> None:
+    client = await Client.connect("localhost:7233")
+
+    while True:
+        # simulate running your workflows
+
+        await client.execute_workflow(
+            YourWorkflow.run,
+            id="your-workflow",
+            task_queue="canary-task-queue",
+        )
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
diff --git a/prometheus/README.md b/prometheus/README.md
index 091d5171..51bb27b3 100644
--- a/prometheus/README.md
+++ b/prometheus/README.md
@@ -13,5 +13,7 @@ Then, in another terminal, run the following to execute a workflow:
 
     uv run starter.py
 
-After executing the workflow, the process will stay open so the metrics if this separate process can be accessed at
+After executing the workflow, the process will stay open so the metrics will be visible.
+
+This separate process can be accessed at
 http://127.0.0.1:9001/metrics.
\ No newline at end of file