[3/n] Config and items for realtime (openai#1070)

rm-openai · web-flow · commit 78675ff94d79 · 2025-07-11T13:08:56.000-04:00
Similar to the TS version. - Config is the set of things you can set on the session. - Items are similar to responses items. I'm using an abstraction instead of reusing the ones in the openai SDK, to reduce the amount of work for other providers. --- [//]: # (BEGIN SAPLING FOOTER) * openai#1074 * openai#1073 * openai#1072 * openai#1071 * __->__ openai#1070 * openai#1069 * openai#1068
diff --git a/src/agents/model_settings.py b/src/agents/model_settings.py
@@ -45,6 +45,7 @@ def validate_from_none(value: None) -> _Omit:
 
 Omit = Annotated[_Omit, _OmitTypeAnnotation]
 Headers: TypeAlias = Mapping[str, Union[str, Omit]]
+ToolChoice: TypeAlias = Union[Literal["auto", "required", "none"], str, None]
 
 
 @dataclass
@@ -70,7 +71,7 @@ class ModelSettings:
     presence_penalty: float | None = None
     """The presence penalty to use when calling the model."""
 
-    tool_choice: Literal["auto", "required", "none"] | str | None = None
+    tool_choice: ToolChoice | None = None
     """The tool choice to use when calling the model."""
 
     parallel_tool_calls: bool | None = None
diff --git a/src/agents/realtime/config.py b/src/agents/realtime/config.py
@@ -0,0 +1,91 @@
+from __future__ import annotations
+
+import inspect
+from typing import (
+    Any,
+    Callable,
+    Literal,
+    Union,
+)
+
+from typing_extensions import NotRequired, TypeAlias, TypedDict
+
+from ..model_settings import ToolChoice
+from ..tool import FunctionTool
+from ..util._types import MaybeAwaitable
+
+
+class RealtimeClientMessage(TypedDict):  # only `type` is mandatory
+    type: str  # explicitly required
+    other_data: NotRequired[dict[str, Any]]  # optional; arbitrary string-keyed payload
+
+
+class UserInputText(TypedDict):  # one text part of a RealtimeUserInputMessage
+    type: Literal["input_text"]  # fixed tag for this part
+    text: str  # the text itself
+
+
+class RealtimeUserInputMessage(TypedDict):  # structured user input built from text parts
+    type: Literal["message"]
+    role: Literal["user"]
+    content: list[UserInputText]  # every part is an `input_text` entry
+
+
+RealtimeUserInput: TypeAlias = Union[str, RealtimeUserInputMessage]  # bare text or a message
+
+
+RealtimeAudioFormat: TypeAlias = Union[Literal["pcm16", "g711_ulaw", "g711_alaw"], str]
+
+
+class RealtimeInputAudioTranscriptionConfig(TypedDict):  # every key is optional
+    language: NotRequired[str]  # NOTE(review): expected format (e.g. language code) — confirm
+    model: NotRequired[Literal["gpt-4o-transcribe", "gpt-4o-mini-transcribe", "whisper-1"] | str]
+    prompt: NotRequired[str]
+
+
+class RealtimeTurnDetectionConfig(TypedDict):
+    """Turn detection config; every key is optional. Vendor-specific keys are not declared."""
+
+    type: NotRequired[Literal["semantic_vad", "server_vad"]]  # which detection mode to use
+    create_response: NotRequired[bool]
+    eagerness: NotRequired[Literal["auto", "low", "medium", "high"]]
+    interrupt_response: NotRequired[bool]
+    prefix_padding_ms: NotRequired[int]  # milliseconds, per the `_ms` suffix
+    silence_duration_ms: NotRequired[int]  # milliseconds, per the `_ms` suffix
+    threshold: NotRequired[float]  # NOTE(review): valid range is not stated here — confirm
+
+
+class RealtimeSessionConfig(TypedDict):  # session-level settings; every key is optional
+    api_key: NotRequired[APIKeyOrKeyFunc]  # a key string, or a function that returns one
+    model: NotRequired[str]
+    instructions: NotRequired[str]
+    modalities: NotRequired[list[Literal["text", "audio"]]]
+    voice: NotRequired[str]
+
+    input_audio_format: NotRequired[RealtimeAudioFormat]
+    output_audio_format: NotRequired[RealtimeAudioFormat]
+    input_audio_transcription: NotRequired[RealtimeInputAudioTranscriptionConfig]
+    turn_detection: NotRequired[RealtimeTurnDetectionConfig]
+
+    tool_choice: NotRequired[ToolChoice]  # same alias that ModelSettings.tool_choice uses
+    tools: NotRequired[list[FunctionTool]]
+
+
+APIKeyOrKeyFunc: TypeAlias = Union[str, Callable[[], MaybeAwaitable[str]]]
+"""Either an API key, or a zero-argument function that returns one (possibly awaitable)."""
+
+
+async def get_api_key(key: APIKeyOrKeyFunc | None) -> str | None:
+    """Resolve ``key`` to an API key string, or ``None`` when no key is given.
+
+    A string is returned unchanged. Anything else is called with no arguments,
+    and the call's result is awaited first if it is awaitable.
+    """
+    if key is None or isinstance(key, str):
+        return key
+
+    maybe_key = key()
+    return await maybe_key if inspect.isawaitable(maybe_key) else maybe_key
+
+    # TODO (rm) Add tracing support (presumably as a RealtimeSessionConfig key, e.g.):
+    # tracing: NotRequired[RealtimeTracingConfig | None]
diff --git a/src/agents/realtime/items.py b/src/agents/realtime/items.py
@@ -0,0 +1,100 @@
+from __future__ import annotations
+
+from typing import Annotated, Literal, Union
+
+from pydantic import BaseModel, ConfigDict, Field
+
+
+class InputText(BaseModel):  # text content part, tagged "input_text"
+    type: Literal["input_text"] = "input_text"
+    text: str
+
+    # Fields not declared above are accepted and kept rather than rejected.
+    model_config = ConfigDict(extra="allow")
+
+
+class InputAudio(BaseModel):  # audio content part, tagged "input_audio"
+    type: Literal["input_audio"] = "input_audio"
+    audio: str | None = None  # NOTE(review): presumably encoded audio data — confirm encoding
+    transcript: str | None = None  # optional; defaults to None
+
+    # Fields not declared above are accepted and kept rather than rejected.
+    model_config = ConfigDict(extra="allow")
+
+
+class AssistantText(BaseModel):  # text content part of an assistant message, tagged "text"
+    type: Literal["text"] = "text"
+    text: str
+
+    # Fields not declared above are accepted and kept rather than rejected.
+    model_config = ConfigDict(extra="allow")
+
+
+class AssistantAudio(BaseModel):  # audio content part of an assistant message, tagged "audio"
+    type: Literal["audio"] = "audio"
+    audio: str | None = None  # NOTE(review): presumably encoded audio data — confirm encoding
+    transcript: str | None = None  # optional; defaults to None
+
+    # Fields not declared above are accepted and kept rather than rejected.
+    model_config = ConfigDict(extra="allow")
+
+
+class SystemMessageItem(BaseModel):  # message item whose role is fixed to "system"
+    item_id: str
+    previous_item_id: str | None = None  # NOTE(review): presumably the preceding item's id
+    type: Literal["message"] = "message"
+    role: Literal["system"] = "system"  # discriminator value within RealtimeMessageItem
+    content: list[InputText]  # text parts only
+
+    # Fields not declared above are accepted and kept rather than rejected.
+    model_config = ConfigDict(extra="allow")
+
+
+class UserMessageItem(BaseModel):  # message item whose role is fixed to "user"
+    item_id: str
+    previous_item_id: str | None = None  # NOTE(review): presumably the preceding item's id
+    type: Literal["message"] = "message"
+    role: Literal["user"] = "user"  # discriminator value within RealtimeMessageItem
+    content: list[InputText | InputAudio]  # text and audio parts may be mixed
+
+    # Fields not declared above are accepted and kept rather than rejected.
+    model_config = ConfigDict(extra="allow")
+
+
+class AssistantMessageItem(BaseModel):  # message item whose role is fixed to "assistant"
+    item_id: str
+    previous_item_id: str | None = None  # NOTE(review): presumably the preceding item's id
+    type: Literal["message"] = "message"
+    role: Literal["assistant"] = "assistant"  # discriminator value within RealtimeMessageItem
+    status: Literal["in_progress", "completed", "incomplete"] | None = None  # optional
+    content: list[AssistantText | AssistantAudio]  # text and audio parts may be mixed
+
+    # Fields not declared above are accepted and kept rather than rejected.
+    model_config = ConfigDict(extra="allow")
+
+
+RealtimeMessageItem = Annotated[
+    Union[SystemMessageItem, UserMessageItem, AssistantMessageItem],
+    Field(discriminator="role"),  # the `role` value selects which message model applies
+]
+
+
+class RealtimeToolCallItem(BaseModel):  # function-call item: name, arguments, optional output
+    item_id: str
+    previous_item_id: str | None = None  # NOTE(review): presumably the preceding item's id
+    type: Literal["function_call"] = "function_call"
+    status: Literal["in_progress", "completed"]  # required, unlike AssistantMessageItem.status
+    arguments: str  # NOTE(review): presumably JSON-encoded arguments — confirm
+    name: str
+    output: str | None = None  # optional; defaults to None
+
+    # Fields not declared above are accepted and kept rather than rejected.
+    model_config = ConfigDict(extra="allow")
+
+
+RealtimeItem = Union[RealtimeMessageItem, RealtimeToolCallItem]  # any message or tool-call item
+
+
+class RealtimeResponse(BaseModel):  # a response: an id plus a list of message items
+    id: str
+    output: list[RealtimeMessageItem]  # message items only; RealtimeToolCallItem is excluded