deepankarm · deepankarm · Jun 9, 2025
diff --git a/pyproject.toml b/pyproject.toml
@@ -62,6 +62,7 @@ dev = [
     "mkdocs-static-i18n>=1.3.0",
     "eval-type-backport>=0.2.2",
     "fastapi >= 0.110.0, <1",
+    "pyleak>=0.1.10",
 ]
 
 [tool.uv.workspace]
@@ -133,6 +134,7 @@ filterwarnings = [
 ]
 markers = [
     "allow_call_model_methods: mark test as allowing calls to real model implementations",
+    "no_leaks: detect asyncio task leaks, thread leaks, and event loop blocking",
 ]
 
 [tool.inline-snapshot]

diff --git a/tests/conftest.py b/tests/conftest.py
@@ -53,3 +53,9 @@ def failing_version(*args, **kwargs):
     monkeypatch.setattr(OpenAIResponsesModel, "stream_response", failing_version)
     monkeypatch.setattr(OpenAIChatCompletionsModel, "get_response", failing_version)
     monkeypatch.setattr(OpenAIChatCompletionsModel, "stream_response", failing_version)
+
+
+def pytest_configure(config):
+    """Append the ``no_leaks`` marker description to pytest's ``markers`` ini value."""
+    marker = "no_leaks: detect asyncio task leaks, thread leaks, and event loop blocking"
+    config.addinivalue_line("markers", marker)
diff --git a/tests/test_agent_runner.py b/tests/test_agent_runner.py
@@ -125,6 +125,39 @@ async def test_tool_call_runs():
     )
 
 
+@pytest.mark.no_leaks(tasks=True, blocking=True, blocking_threshold=0.2, threads=False)
+@pytest.mark.asyncio
+async def test_tool_call_run_blocking():
+    """Run a two-turn tool-call flow whose tool is built with ``blocking=True``."""
+    model = FakeModel()
+    # NOTE(review): presumably the blocking tool is meant to trip the ``no_leaks``
+    # event-loop-blocking check (threshold 0.2) -- confirm the expected outcome.
+    blocking_tool = get_function_tool("foo", "tool_result", blocking=True)
+    agent = Agent(name="test", model=model, tools=[blocking_tool])
+
+    model.add_multiple_turn_outputs(
+        [
+            # First turn: a message and tool call
+            [get_text_message("a_message"), get_function_tool_call("foo", json.dumps({"a": "b"}))],
+            # Second turn: text message
+            [get_text_message("done")],
+        ]
+    )
+
+    result = await Runner.run(agent, input="user_message")
+
+    assert result.final_output == "done"
+    assert len(result.raw_responses) == 2, (
+        "should have two responses: the first which produces a tool call, and the second which "
+        "handles the tool result"
+    )
+
+    assert len(result.to_input_list()) == 5, (
+        "should have five inputs: the original input, the message, the tool call, the tool result "
+        "and the done message"
+    )
+
+
 @pytest.mark.asyncio
 async def test_handoffs():
     model = FakeModel()
@@ -256,9 +289,9 @@ async def test_handoff_filters():
 
     assert result.final_output == "last"
     assert len(result.raw_responses) == 2, "should have two model responses"
-    assert len(result.to_input_list()) == 2, (
-        "should only have 2 inputs: orig input and last message"
-    )
+    assert (
+        len(result.to_input_list()) == 2
+    ), "should only have 2 inputs: orig input and last message"
 
 
 @pytest.mark.asyncio
@@ -592,9 +625,9 @@ async def test_tool_use_behavior_first_output():
 
     result = await Runner.run(agent, input="user_message")
 
-    assert result.final_output == Foo(bar="tool_one_result"), (
-        "should have used the first tool result"
-    )
+    assert result.final_output == Foo(
+        bar="tool_one_result"
+    ), "should have used the first tool result"
 
 
 def custom_tool_use_behavior(

diff --git a/tests/test_responses.py b/tests/test_responses.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import time
 from typing import Any
 
 from openai.types.responses import (
@@ -37,9 +38,14 @@ def get_text_message(content: str) -> ResponseOutputItem:
 
 
 def get_function_tool(
-    name: str | None = None, return_value: str | None = None, hide_errors: bool = False
+    name: str | None = None,
+    return_value: str | None = None,
+    hide_errors: bool = False,
+    blocking: bool = False,
 ) -> FunctionTool:
     def _foo() -> str:
+        if blocking:
+            time.sleep(1)
         return return_value or "result_ok"
 
     return function_tool(