From 4756247cee3d9548397b26a29109e76cc9522379 Mon Sep 17 00:00:00 2001 From: "stainless-app[bot]" <142633134+stainless-app[bot]@users.noreply.github.com> Date: Fri, 12 Sep 2025 15:51:27 -0400 Subject: [PATCH] release: 1.107.2 (#2624) * chore(api): Minor docs and type updates for realtime * codegen metadata * chore(tests): simplify `get_platform` test `nest_asyncio` is archived and broken on some platforms so it's not worth keeping in our test suite. * release: 1.107.2 --------- Co-authored-by: stainless-app[bot] <142633134+stainless-app[bot]@users.noreply.github.com> --- .release-please-manifest.json | 2 +- .stats.yml | 4 +- CHANGELOG.md | 9 +++ pyproject.toml | 2 +- requirements-dev.lock | 3 +- src/openai/_version.py | 2 +- src/openai/resources/responses/responses.py | 48 ++++++------ .../input_audio_buffer_timeout_triggered.py | 10 ++- .../realtime/realtime_audio_config_input.py | 7 +- .../realtime_audio_config_input_param.py | 10 ++- .../realtime_audio_input_turn_detection.py | 68 ++++++++++++----- ...altime_audio_input_turn_detection_param.py | 65 +++++++++++----- .../realtime_session_create_response.py | 74 ++++++++++++++----- ...ltime_transcription_session_audio_input.py | 7 +- ...transcription_session_audio_input_param.py | 10 ++- ...tion_session_audio_input_turn_detection.py | 67 +++++++++++++---- ...ession_audio_input_turn_detection_param.py | 64 ++++++++++++---- src/openai/types/responses/response.py | 8 +- .../types/responses/response_create_params.py | 8 +- .../realtime/test_client_secrets.py | 10 +-- tests/test_client.py | 53 ++----------- 21 files changed, 344 insertions(+), 187 deletions(-) diff --git a/.release-please-manifest.json b/.release-please-manifest.json index 25880b2e7b..32e0d8892c 100644 --- a/.release-please-manifest.json +++ b/.release-please-manifest.json @@ -1,3 +1,3 @@ { - ".": "1.107.1" + ".": "1.107.2" } \ No newline at end of file diff --git a/.stats.yml b/.stats.yml index 2aa16be875..e389718967 100644 --- a/.stats.yml +++ b/.stats.yml @@ -1,4 +1,4 @@ configured_endpoints: 118 -openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-16cb18bed32bae8c5840fb39a1bf664026cc40463ad0c487dcb0df1bd3d72db0.yml -openapi_spec_hash: 4cb51b22f98dee1a90bc7add82d1d132 +openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-94b1e3cb0bdc616ff0c2f267c33dadd95f133b1f64e647aab6c64afb292b2793.yml +openapi_spec_hash: 2395319ac9befd59b6536ae7f9564a05 config_hash: 930dac3aa861344867e4ac84f037b5df diff --git a/CHANGELOG.md b/CHANGELOG.md index 19eab7da7e..31ccac5195 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,14 @@ # Changelog +## 1.107.2 (2025-09-12) + +Full Changelog: [v1.107.1...v1.107.2](https://github.com/openai/openai-python/compare/v1.107.1...v1.107.2) + +### Chores + +* **api:** Minor docs and type updates for realtime ([ab6a10d](https://github.com/openai/openai-python/commit/ab6a10da4ed7e6386695b6f5f29149d4870f85c9)) +* **tests:** simplify `get_platform` test ([01f03e0](https://github.com/openai/openai-python/commit/01f03e0ad1f9ab3f2ed8b7c13d652263c6d06378)) + ## 1.107.1 (2025-09-10) Full Changelog: [v1.107.0...v1.107.1](https://github.com/openai/openai-python/compare/v1.107.0...v1.107.1) diff --git a/pyproject.toml b/pyproject.toml index 326dc5a004..7cb1ef4f76 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "openai" -version = "1.107.1" +version = "1.107.2" description = "The official Python library for the openai API" dynamic = ["readme"] license = 
"Apache-2.0" diff --git a/requirements-dev.lock b/requirements-dev.lock index 7d690683e9..eaf136f7e6 100644 --- a/requirements-dev.lock +++ b/requirements-dev.lock @@ -70,7 +70,7 @@ filelock==3.12.4 frozenlist==1.7.0 # via aiohttp # via aiosignal -griffe==1.14.0 +griffe==1.13.0 h11==0.16.0 # via httpcore httpcore==1.0.9 @@ -108,7 +108,6 @@ multidict==6.5.0 mypy==1.14.1 mypy-extensions==1.0.0 # via mypy -nest-asyncio==1.6.0 nodeenv==1.8.0 # via pyright nox==2023.4.22 diff --git a/src/openai/_version.py b/src/openai/_version.py index f337b21cd5..70f9958885 100644 --- a/src/openai/_version.py +++ b/src/openai/_version.py @@ -1,4 +1,4 @@ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. __title__ = "openai" -__version__ = "1.107.1" # x-release-please-version +__version__ = "1.107.2" # x-release-please-version diff --git a/src/openai/resources/responses/responses.py b/src/openai/resources/responses/responses.py index 837d2b2211..8acdb10b51 100644 --- a/src/openai/resources/responses/responses.py +++ b/src/openai/resources/responses/responses.py @@ -288,10 +288,10 @@ def create( truncation: The truncation strategy to use for the model response. - - `auto`: If the context of this response and previous ones exceeds the model's - context window size, the model will truncate the response to fit the context - window by dropping input items in the middle of the conversation. - - `disabled` (default): If a model response will exceed the context window size + - `auto`: If the input to this Response exceeds the model's context window size, + the model will truncate the response to fit the context window by dropping + items from the beginning of the conversation. + - `disabled` (default): If the input size will exceed the context window size for a model, the request will fail with a 400 error. user: This field is being replaced by `safety_identifier` and `prompt_cache_key`. Use @@ -527,10 +527,10 @@ def create( truncation: The truncation strategy to use for the model response. - - `auto`: If the context of this response and previous ones exceeds the model's - context window size, the model will truncate the response to fit the context - window by dropping input items in the middle of the conversation. - - `disabled` (default): If a model response will exceed the context window size + - `auto`: If the input to this Response exceeds the model's context window size, + the model will truncate the response to fit the context window by dropping + items from the beginning of the conversation. + - `disabled` (default): If the input size will exceed the context window size for a model, the request will fail with a 400 error. user: This field is being replaced by `safety_identifier` and `prompt_cache_key`. Use @@ -766,10 +766,10 @@ def create( truncation: The truncation strategy to use for the model response. - - `auto`: If the context of this response and previous ones exceeds the model's - context window size, the model will truncate the response to fit the context - window by dropping input items in the middle of the conversation. - - `disabled` (default): If a model response will exceed the context window size + - `auto`: If the input to this Response exceeds the model's context window size, + the model will truncate the response to fit the context window by dropping + items from the beginning of the conversation. + - `disabled` (default): If the input size will exceed the context window size for a model, the request will fail with a 400 error. 
user: This field is being replaced by `safety_identifier` and `prompt_cache_key`. Use @@ -1719,10 +1719,10 @@ async def create( truncation: The truncation strategy to use for the model response. - - `auto`: If the context of this response and previous ones exceeds the model's - context window size, the model will truncate the response to fit the context - window by dropping input items in the middle of the conversation. - - `disabled` (default): If a model response will exceed the context window size + - `auto`: If the input to this Response exceeds the model's context window size, + the model will truncate the response to fit the context window by dropping + items from the beginning of the conversation. + - `disabled` (default): If the input size will exceed the context window size for a model, the request will fail with a 400 error. user: This field is being replaced by `safety_identifier` and `prompt_cache_key`. Use @@ -1958,10 +1958,10 @@ async def create( truncation: The truncation strategy to use for the model response. - - `auto`: If the context of this response and previous ones exceeds the model's - context window size, the model will truncate the response to fit the context - window by dropping input items in the middle of the conversation. - - `disabled` (default): If a model response will exceed the context window size + - `auto`: If the input to this Response exceeds the model's context window size, + the model will truncate the response to fit the context window by dropping + items from the beginning of the conversation. + - `disabled` (default): If the input size will exceed the context window size for a model, the request will fail with a 400 error. user: This field is being replaced by `safety_identifier` and `prompt_cache_key`. Use @@ -2197,10 +2197,10 @@ async def create( truncation: The truncation strategy to use for the model response. - - `auto`: If the context of this response and previous ones exceeds the model's - context window size, the model will truncate the response to fit the context - window by dropping input items in the middle of the conversation. - - `disabled` (default): If a model response will exceed the context window size + - `auto`: If the input to this Response exceeds the model's context window size, + the model will truncate the response to fit the context window by dropping + items from the beginning of the conversation. + - `disabled` (default): If the input size will exceed the context window size for a model, the request will fail with a 400 error. user: This field is being replaced by `safety_identifier` and `prompt_cache_key`. Use diff --git a/src/openai/types/realtime/input_audio_buffer_timeout_triggered.py b/src/openai/types/realtime/input_audio_buffer_timeout_triggered.py index ed592ac06b..5c5dc5cfa6 100644 --- a/src/openai/types/realtime/input_audio_buffer_timeout_triggered.py +++ b/src/openai/types/realtime/input_audio_buffer_timeout_triggered.py @@ -9,10 +9,16 @@ class InputAudioBufferTimeoutTriggered(BaseModel): audio_end_ms: int - """Millisecond offset where speech ended within the buffered audio.""" + """ + Millisecond offset of audio written to the input audio buffer at the time the + timeout was triggered. + """ audio_start_ms: int - """Millisecond offset where speech started within the buffered audio.""" + """ + Millisecond offset of audio written to the input audio buffer that was after the + playback time of the last model response. 
+ """ event_id: str """The unique ID of the server event.""" diff --git a/src/openai/types/realtime/realtime_audio_config_input.py b/src/openai/types/realtime/realtime_audio_config_input.py index fd96e2a52d..cfcb7f22d4 100644 --- a/src/openai/types/realtime/realtime_audio_config_input.py +++ b/src/openai/types/realtime/realtime_audio_config_input.py @@ -49,8 +49,11 @@ class RealtimeAudioConfigInput(BaseModel): """Configuration for turn detection, ether Server VAD or Semantic VAD. This can be set to `null` to turn off, in which case the client must manually - trigger model response. Server VAD means that the model will detect the start - and end of speech based on audio volume and respond at the end of user speech. + trigger model response. + + Server VAD means that the model will detect the start and end of speech based on + audio volume and respond at the end of user speech. + Semantic VAD is more advanced and uses a turn detection model (in conjunction with VAD) to semantically estimate whether the user has finished speaking, then dynamically sets a timeout based on this probability. For example, if user audio diff --git a/src/openai/types/realtime/realtime_audio_config_input_param.py b/src/openai/types/realtime/realtime_audio_config_input_param.py index 1dfb439006..730f46cfec 100644 --- a/src/openai/types/realtime/realtime_audio_config_input_param.py +++ b/src/openai/types/realtime/realtime_audio_config_input_param.py @@ -2,6 +2,7 @@ from __future__ import annotations +from typing import Optional from typing_extensions import TypedDict from .noise_reduction_type import NoiseReductionType @@ -46,12 +47,15 @@ class RealtimeAudioConfigInputParam(TypedDict, total=False): transcription, these offer additional guidance to the transcription service. """ - turn_detection: RealtimeAudioInputTurnDetectionParam + turn_detection: Optional[RealtimeAudioInputTurnDetectionParam] """Configuration for turn detection, ether Server VAD or Semantic VAD. This can be set to `null` to turn off, in which case the client must manually - trigger model response. Server VAD means that the model will detect the start - and end of speech based on audio volume and respond at the end of user speech. + trigger model response. + + Server VAD means that the model will detect the start and end of speech based on + audio volume and respond at the end of user speech. + Semantic VAD is more advanced and uses a turn detection model (in conjunction with VAD) to semantically estimate whether the user has finished speaking, then dynamically sets a timeout based on this probability. For example, if user audio diff --git a/src/openai/types/realtime/realtime_audio_input_turn_detection.py b/src/openai/types/realtime/realtime_audio_input_turn_detection.py index 1c736ab2b7..d3f4e00316 100644 --- a/src/openai/types/realtime/realtime_audio_input_turn_detection.py +++ b/src/openai/types/realtime/realtime_audio_input_turn_detection.py @@ -1,33 +1,38 @@ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. 
-from typing import Optional -from typing_extensions import Literal +from typing import Union, Optional +from typing_extensions import Literal, Annotated, TypeAlias +from ..._utils import PropertyInfo from ..._models import BaseModel -__all__ = ["RealtimeAudioInputTurnDetection"] +__all__ = ["RealtimeAudioInputTurnDetection", "ServerVad", "SemanticVad"] -class RealtimeAudioInputTurnDetection(BaseModel): +class ServerVad(BaseModel): + type: Literal["server_vad"] + """Type of turn detection, `server_vad` to turn on simple Server VAD.""" + create_response: Optional[bool] = None """ Whether or not to automatically generate a response when a VAD stop event occurs. """ - eagerness: Optional[Literal["low", "medium", "high", "auto"]] = None - """Used only for `semantic_vad` mode. + idle_timeout_ms: Optional[int] = None + """Optional timeout after which a model response will be triggered automatically. - The eagerness of the model to respond. `low` will wait longer for the user to - continue speaking, `high` will respond more quickly. `auto` is the default and - is equivalent to `medium`. `low`, `medium`, and `high` have max timeouts of 8s, - 4s, and 2s respectively. - """ + This is useful for situations in which a long pause from the user is unexpected, + such as a phone call. The model will effectively prompt the user to continue the + conversation based on the current context. - idle_timeout_ms: Optional[int] = None - """ - Optional idle timeout after which turn detection will auto-timeout when no - additional audio is received and emits a `timeout_triggered` event. + The timeout value will be applied after the last model response's audio has + finished playing, i.e. it's set to the `response.done` time plus audio playback + duration. + + An `input_audio_buffer.timeout_triggered` event (plus events associated with the + Response) will be emitted when the timeout is reached. Idle timeout is currently + only supported for `server_vad` mode. """ interrupt_response: Optional[bool] = None @@ -60,5 +65,34 @@ class RealtimeAudioInputTurnDetection(BaseModel): perform better in noisy environments. """ - type: Optional[Literal["server_vad", "semantic_vad"]] = None - """Type of turn detection.""" + +class SemanticVad(BaseModel): + type: Literal["semantic_vad"] + """Type of turn detection, `semantic_vad` to turn on Semantic VAD.""" + + create_response: Optional[bool] = None + """ + Whether or not to automatically generate a response when a VAD stop event + occurs. + """ + + eagerness: Optional[Literal["low", "medium", "high", "auto"]] = None + """Used only for `semantic_vad` mode. + + The eagerness of the model to respond. `low` will wait longer for the user to + continue speaking, `high` will respond more quickly. `auto` is the default and + is equivalent to `medium`. `low`, `medium`, and `high` have max timeouts of 8s, + 4s, and 2s respectively. + """ + + interrupt_response: Optional[bool] = None + """ + Whether or not to automatically interrupt any ongoing response with output to + the default conversation (i.e. `conversation` of `auto`) when a VAD start event + occurs. 
+ """ + + +RealtimeAudioInputTurnDetection: TypeAlias = Annotated[ + Union[ServerVad, SemanticVad, None], PropertyInfo(discriminator="type") +] diff --git a/src/openai/types/realtime/realtime_audio_input_turn_detection_param.py b/src/openai/types/realtime/realtime_audio_input_turn_detection_param.py index 79cabec708..09b8cfd159 100644 --- a/src/openai/types/realtime/realtime_audio_input_turn_detection_param.py +++ b/src/openai/types/realtime/realtime_audio_input_turn_detection_param.py @@ -2,32 +2,36 @@ from __future__ import annotations -from typing import Optional -from typing_extensions import Literal, TypedDict +from typing import Union, Optional +from typing_extensions import Literal, Required, TypeAlias, TypedDict -__all__ = ["RealtimeAudioInputTurnDetectionParam"] +__all__ = ["RealtimeAudioInputTurnDetectionParam", "ServerVad", "SemanticVad"] -class RealtimeAudioInputTurnDetectionParam(TypedDict, total=False): +class ServerVad(TypedDict, total=False): + type: Required[Literal["server_vad"]] + """Type of turn detection, `server_vad` to turn on simple Server VAD.""" + create_response: bool """ Whether or not to automatically generate a response when a VAD stop event occurs. """ - eagerness: Literal["low", "medium", "high", "auto"] - """Used only for `semantic_vad` mode. + idle_timeout_ms: Optional[int] + """Optional timeout after which a model response will be triggered automatically. - The eagerness of the model to respond. `low` will wait longer for the user to - continue speaking, `high` will respond more quickly. `auto` is the default and - is equivalent to `medium`. `low`, `medium`, and `high` have max timeouts of 8s, - 4s, and 2s respectively. - """ + This is useful for situations in which a long pause from the user is unexpected, + such as a phone call. The model will effectively prompt the user to continue the + conversation based on the current context. - idle_timeout_ms: Optional[int] - """ - Optional idle timeout after which turn detection will auto-timeout when no - additional audio is received and emits a `timeout_triggered` event. + The timeout value will be applied after the last model response's audio has + finished playing, i.e. it's set to the `response.done` time plus audio playback + duration. + + An `input_audio_buffer.timeout_triggered` event (plus events associated with the + Response) will be emitted when the timeout is reached. Idle timeout is currently + only supported for `server_vad` mode. """ interrupt_response: bool @@ -60,5 +64,32 @@ class RealtimeAudioInputTurnDetectionParam(TypedDict, total=False): perform better in noisy environments. """ - type: Literal["server_vad", "semantic_vad"] - """Type of turn detection.""" + +class SemanticVad(TypedDict, total=False): + type: Required[Literal["semantic_vad"]] + """Type of turn detection, `semantic_vad` to turn on Semantic VAD.""" + + create_response: bool + """ + Whether or not to automatically generate a response when a VAD stop event + occurs. + """ + + eagerness: Literal["low", "medium", "high", "auto"] + """Used only for `semantic_vad` mode. + + The eagerness of the model to respond. `low` will wait longer for the user to + continue speaking, `high` will respond more quickly. `auto` is the default and + is equivalent to `medium`. `low`, `medium`, and `high` have max timeouts of 8s, + 4s, and 2s respectively. + """ + + interrupt_response: bool + """ + Whether or not to automatically interrupt any ongoing response with output to + the default conversation (i.e. 
`conversation` of `auto`) when a VAD start event + occurs. + """ + + +RealtimeAudioInputTurnDetectionParam: TypeAlias = Union[ServerVad, SemanticVad] diff --git a/src/openai/types/realtime/realtime_session_create_response.py b/src/openai/types/realtime/realtime_session_create_response.py index 7779f07a6e..8d7bfd6d8e 100644 --- a/src/openai/types/realtime/realtime_session_create_response.py +++ b/src/openai/types/realtime/realtime_session_create_response.py @@ -1,8 +1,9 @@ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. from typing import Dict, List, Union, Optional -from typing_extensions import Literal, TypeAlias +from typing_extensions import Literal, Annotated, TypeAlias +from ..._utils import PropertyInfo from ..._models import BaseModel from .audio_transcription import AudioTranscription from .realtime_truncation import RealtimeTruncation @@ -21,6 +22,8 @@ "AudioInput", "AudioInputNoiseReduction", "AudioInputTurnDetection", + "AudioInputTurnDetectionServerVad", + "AudioInputTurnDetectionSemanticVad", "AudioOutput", "ToolChoice", "Tool", @@ -45,26 +48,30 @@ class AudioInputNoiseReduction(BaseModel): """ -class AudioInputTurnDetection(BaseModel): +class AudioInputTurnDetectionServerVad(BaseModel): + type: Literal["server_vad"] + """Type of turn detection, `server_vad` to turn on simple Server VAD.""" + create_response: Optional[bool] = None """ Whether or not to automatically generate a response when a VAD stop event occurs. """ - eagerness: Optional[Literal["low", "medium", "high", "auto"]] = None - """Used only for `semantic_vad` mode. + idle_timeout_ms: Optional[int] = None + """Optional timeout after which a model response will be triggered automatically. - The eagerness of the model to respond. `low` will wait longer for the user to - continue speaking, `high` will respond more quickly. `auto` is the default and - is equivalent to `medium`. `low`, `medium`, and `high` have max timeouts of 8s, - 4s, and 2s respectively. - """ + This is useful for situations in which a long pause from the user is unexpected, + such as a phone call. The model will effectively prompt the user to continue the + conversation based on the current context. - idle_timeout_ms: Optional[int] = None - """ - Optional idle timeout after which turn detection will auto-timeout when no - additional audio is received and emits a `timeout_triggered` event. + The timeout value will be applied after the last model response's audio has + finished playing, i.e. it's set to the `response.done` time plus audio playback + duration. + + An `input_audio_buffer.timeout_triggered` event (plus events associated with the + Response) will be emitted when the timeout is reached. Idle timeout is currently + only supported for `server_vad` mode. """ interrupt_response: Optional[bool] = None @@ -97,8 +104,38 @@ class AudioInputTurnDetection(BaseModel): perform better in noisy environments. """ - type: Optional[Literal["server_vad", "semantic_vad"]] = None - """Type of turn detection.""" + +class AudioInputTurnDetectionSemanticVad(BaseModel): + type: Literal["semantic_vad"] + """Type of turn detection, `semantic_vad` to turn on Semantic VAD.""" + + create_response: Optional[bool] = None + """ + Whether or not to automatically generate a response when a VAD stop event + occurs. + """ + + eagerness: Optional[Literal["low", "medium", "high", "auto"]] = None + """Used only for `semantic_vad` mode. + + The eagerness of the model to respond. 
`low` will wait longer for the user to + continue speaking, `high` will respond more quickly. `auto` is the default and + is equivalent to `medium`. `low`, `medium`, and `high` have max timeouts of 8s, + 4s, and 2s respectively. + """ + + interrupt_response: Optional[bool] = None + """ + Whether or not to automatically interrupt any ongoing response with output to + the default conversation (i.e. `conversation` of `auto`) when a VAD start event + occurs. + """ + + +AudioInputTurnDetection: TypeAlias = Annotated[ + Union[AudioInputTurnDetectionServerVad, AudioInputTurnDetectionSemanticVad, None], + PropertyInfo(discriminator="type"), +] class AudioInput(BaseModel): @@ -130,8 +167,11 @@ class AudioInput(BaseModel): """Configuration for turn detection, ether Server VAD or Semantic VAD. This can be set to `null` to turn off, in which case the client must manually - trigger model response. Server VAD means that the model will detect the start - and end of speech based on audio volume and respond at the end of user speech. + trigger model response. + + Server VAD means that the model will detect the start and end of speech based on + audio volume and respond at the end of user speech. + Semantic VAD is more advanced and uses a turn detection model (in conjunction with VAD) to semantically estimate whether the user has finished speaking, then dynamically sets a timeout based on this probability. For example, if user audio diff --git a/src/openai/types/realtime/realtime_transcription_session_audio_input.py b/src/openai/types/realtime/realtime_transcription_session_audio_input.py index 0ae92959aa..efc321cbeb 100644 --- a/src/openai/types/realtime/realtime_transcription_session_audio_input.py +++ b/src/openai/types/realtime/realtime_transcription_session_audio_input.py @@ -51,8 +51,11 @@ class RealtimeTranscriptionSessionAudioInput(BaseModel): """Configuration for turn detection, ether Server VAD or Semantic VAD. This can be set to `null` to turn off, in which case the client must manually - trigger model response. Server VAD means that the model will detect the start - and end of speech based on audio volume and respond at the end of user speech. + trigger model response. + + Server VAD means that the model will detect the start and end of speech based on + audio volume and respond at the end of user speech. + Semantic VAD is more advanced and uses a turn detection model (in conjunction with VAD) to semantically estimate whether the user has finished speaking, then dynamically sets a timeout based on this probability. For example, if user audio diff --git a/src/openai/types/realtime/realtime_transcription_session_audio_input_param.py b/src/openai/types/realtime/realtime_transcription_session_audio_input_param.py index a8263789dc..c9153b68a4 100644 --- a/src/openai/types/realtime/realtime_transcription_session_audio_input_param.py +++ b/src/openai/types/realtime/realtime_transcription_session_audio_input_param.py @@ -2,6 +2,7 @@ from __future__ import annotations +from typing import Optional from typing_extensions import TypedDict from .noise_reduction_type import NoiseReductionType @@ -48,12 +49,15 @@ class RealtimeTranscriptionSessionAudioInputParam(TypedDict, total=False): transcription, these offer additional guidance to the transcription service. """ - turn_detection: RealtimeTranscriptionSessionAudioInputTurnDetectionParam + turn_detection: Optional[RealtimeTranscriptionSessionAudioInputTurnDetectionParam] """Configuration for turn detection, ether Server VAD or Semantic VAD. 
This can be set to `null` to turn off, in which case the client must manually - trigger model response. Server VAD means that the model will detect the start - and end of speech based on audio volume and respond at the end of user speech. + trigger model response. + + Server VAD means that the model will detect the start and end of speech based on + audio volume and respond at the end of user speech. + Semantic VAD is more advanced and uses a turn detection model (in conjunction with VAD) to semantically estimate whether the user has finished speaking, then dynamically sets a timeout based on this probability. For example, if user audio diff --git a/src/openai/types/realtime/realtime_transcription_session_audio_input_turn_detection.py b/src/openai/types/realtime/realtime_transcription_session_audio_input_turn_detection.py index 0cac36f7a3..7dc7a8f302 100644 --- a/src/openai/types/realtime/realtime_transcription_session_audio_input_turn_detection.py +++ b/src/openai/types/realtime/realtime_transcription_session_audio_input_turn_detection.py @@ -1,32 +1,38 @@ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. -from typing import Optional -from typing_extensions import Literal +from typing import Union, Optional +from typing_extensions import Literal, Annotated, TypeAlias +from ..._utils import PropertyInfo from ..._models import BaseModel -__all__ = ["RealtimeTranscriptionSessionAudioInputTurnDetection"] +__all__ = ["RealtimeTranscriptionSessionAudioInputTurnDetection", "ServerVad", "SemanticVad"] -class RealtimeTranscriptionSessionAudioInputTurnDetection(BaseModel): +class ServerVad(BaseModel): + type: Literal["server_vad"] + """Type of turn detection, `server_vad` to turn on simple Server VAD.""" + create_response: Optional[bool] = None """ Whether or not to automatically generate a response when a VAD stop event occurs. """ - eagerness: Optional[Literal["low", "medium", "high", "auto"]] = None - """Used only for `semantic_vad` mode. + idle_timeout_ms: Optional[int] = None + """Optional timeout after which a model response will be triggered automatically. - The eagerness of the model to respond. `low` will wait longer for the user to - continue speaking, `high` will respond more quickly. `auto` is the default and - is equivalent to `medium`. - """ + This is useful for situations in which a long pause from the user is unexpected, + such as a phone call. The model will effectively prompt the user to continue the + conversation based on the current context. - idle_timeout_ms: Optional[int] = None - """ - Optional idle timeout after which turn detection will auto-timeout when no - additional audio is received. + The timeout value will be applied after the last model response's audio has + finished playing, i.e. it's set to the `response.done` time plus audio playback + duration. + + An `input_audio_buffer.timeout_triggered` event (plus events associated with the + Response) will be emitted when the timeout is reached. Idle timeout is currently + only supported for `server_vad` mode. """ interrupt_response: Optional[bool] = None @@ -59,5 +65,34 @@ class RealtimeTranscriptionSessionAudioInputTurnDetection(BaseModel): perform better in noisy environments. 
""" - type: Optional[Literal["server_vad", "semantic_vad"]] = None - """Type of turn detection.""" + +class SemanticVad(BaseModel): + type: Literal["semantic_vad"] + """Type of turn detection, `semantic_vad` to turn on Semantic VAD.""" + + create_response: Optional[bool] = None + """ + Whether or not to automatically generate a response when a VAD stop event + occurs. + """ + + eagerness: Optional[Literal["low", "medium", "high", "auto"]] = None + """Used only for `semantic_vad` mode. + + The eagerness of the model to respond. `low` will wait longer for the user to + continue speaking, `high` will respond more quickly. `auto` is the default and + is equivalent to `medium`. `low`, `medium`, and `high` have max timeouts of 8s, + 4s, and 2s respectively. + """ + + interrupt_response: Optional[bool] = None + """ + Whether or not to automatically interrupt any ongoing response with output to + the default conversation (i.e. `conversation` of `auto`) when a VAD start event + occurs. + """ + + +RealtimeTranscriptionSessionAudioInputTurnDetection: TypeAlias = Annotated[ + Union[ServerVad, SemanticVad, None], PropertyInfo(discriminator="type") +] diff --git a/src/openai/types/realtime/realtime_transcription_session_audio_input_turn_detection_param.py b/src/openai/types/realtime/realtime_transcription_session_audio_input_turn_detection_param.py index e76dc9a8fe..d899b8c5c1 100644 --- a/src/openai/types/realtime/realtime_transcription_session_audio_input_turn_detection_param.py +++ b/src/openai/types/realtime/realtime_transcription_session_audio_input_turn_detection_param.py @@ -2,31 +2,36 @@ from __future__ import annotations -from typing import Optional -from typing_extensions import Literal, TypedDict +from typing import Union, Optional +from typing_extensions import Literal, Required, TypeAlias, TypedDict -__all__ = ["RealtimeTranscriptionSessionAudioInputTurnDetectionParam"] +__all__ = ["RealtimeTranscriptionSessionAudioInputTurnDetectionParam", "ServerVad", "SemanticVad"] -class RealtimeTranscriptionSessionAudioInputTurnDetectionParam(TypedDict, total=False): +class ServerVad(TypedDict, total=False): + type: Required[Literal["server_vad"]] + """Type of turn detection, `server_vad` to turn on simple Server VAD.""" + create_response: bool """ Whether or not to automatically generate a response when a VAD stop event occurs. """ - eagerness: Literal["low", "medium", "high", "auto"] - """Used only for `semantic_vad` mode. + idle_timeout_ms: Optional[int] + """Optional timeout after which a model response will be triggered automatically. - The eagerness of the model to respond. `low` will wait longer for the user to - continue speaking, `high` will respond more quickly. `auto` is the default and - is equivalent to `medium`. - """ + This is useful for situations in which a long pause from the user is unexpected, + such as a phone call. The model will effectively prompt the user to continue the + conversation based on the current context. - idle_timeout_ms: Optional[int] - """ - Optional idle timeout after which turn detection will auto-timeout when no - additional audio is received. + The timeout value will be applied after the last model response's audio has + finished playing, i.e. it's set to the `response.done` time plus audio playback + duration. + + An `input_audio_buffer.timeout_triggered` event (plus events associated with the + Response) will be emitted when the timeout is reached. Idle timeout is currently + only supported for `server_vad` mode. 
""" interrupt_response: bool @@ -59,5 +64,32 @@ class RealtimeTranscriptionSessionAudioInputTurnDetectionParam(TypedDict, total= perform better in noisy environments. """ - type: Literal["server_vad", "semantic_vad"] - """Type of turn detection.""" + +class SemanticVad(TypedDict, total=False): + type: Required[Literal["semantic_vad"]] + """Type of turn detection, `semantic_vad` to turn on Semantic VAD.""" + + create_response: bool + """ + Whether or not to automatically generate a response when a VAD stop event + occurs. + """ + + eagerness: Literal["low", "medium", "high", "auto"] + """Used only for `semantic_vad` mode. + + The eagerness of the model to respond. `low` will wait longer for the user to + continue speaking, `high` will respond more quickly. `auto` is the default and + is equivalent to `medium`. `low`, `medium`, and `high` have max timeouts of 8s, + 4s, and 2s respectively. + """ + + interrupt_response: bool + """ + Whether or not to automatically interrupt any ongoing response with output to + the default conversation (i.e. `conversation` of `auto`) when a VAD start event + occurs. + """ + + +RealtimeTranscriptionSessionAudioInputTurnDetectionParam: TypeAlias = Union[ServerVad, SemanticVad] diff --git a/src/openai/types/responses/response.py b/src/openai/types/responses/response.py index 163648ef3e..423b6f20f1 100644 --- a/src/openai/types/responses/response.py +++ b/src/openai/types/responses/response.py @@ -252,10 +252,10 @@ class Response(BaseModel): truncation: Optional[Literal["auto", "disabled"]] = None """The truncation strategy to use for the model response. - - `auto`: If the context of this response and previous ones exceeds the model's - context window size, the model will truncate the response to fit the context - window by dropping input items in the middle of the conversation. - - `disabled` (default): If a model response will exceed the context window size + - `auto`: If the input to this Response exceeds the model's context window size, + the model will truncate the response to fit the context window by dropping + items from the beginning of the conversation. + - `disabled` (default): If the input size will exceed the context window size for a model, the request will fail with a 400 error. """ diff --git a/src/openai/types/responses/response_create_params.py b/src/openai/types/responses/response_create_params.py index be687c0aff..af0d5e7483 100644 --- a/src/openai/types/responses/response_create_params.py +++ b/src/openai/types/responses/response_create_params.py @@ -252,10 +252,10 @@ class ResponseCreateParamsBase(TypedDict, total=False): truncation: Optional[Literal["auto", "disabled"]] """The truncation strategy to use for the model response. - - `auto`: If the context of this response and previous ones exceeds the model's - context window size, the model will truncate the response to fit the context - window by dropping input items in the middle of the conversation. - - `disabled` (default): If a model response will exceed the context window size + - `auto`: If the input to this Response exceeds the model's context window size, + the model will truncate the response to fit the context window by dropping + items from the beginning of the conversation. + - `disabled` (default): If the input size will exceed the context window size for a model, the request will fail with a 400 error. 
""" diff --git a/tests/api_resources/realtime/test_client_secrets.py b/tests/api_resources/realtime/test_client_secrets.py index b7bb0e5aa7..cd15b4be52 100644 --- a/tests/api_resources/realtime/test_client_secrets.py +++ b/tests/api_resources/realtime/test_client_secrets.py @@ -44,14 +44,13 @@ def test_method_create_with_all_params(self, client: OpenAI) -> None: "prompt": "prompt", }, "turn_detection": { + "type": "server_vad", "create_response": True, - "eagerness": "low", - "idle_timeout_ms": 0, + "idle_timeout_ms": 5000, "interrupt_response": True, "prefix_padding_ms": 0, "silence_duration_ms": 0, "threshold": 0, - "type": "server_vad", }, }, "output": { @@ -141,14 +140,13 @@ async def test_method_create_with_all_params(self, async_client: AsyncOpenAI) -> "prompt": "prompt", }, "turn_detection": { + "type": "server_vad", "create_response": True, - "eagerness": "low", - "idle_timeout_ms": 0, + "idle_timeout_ms": 5000, "interrupt_response": True, "prefix_padding_ms": 0, "silence_duration_ms": 0, "threshold": 0, - "type": "server_vad", }, }, "output": { diff --git a/tests/test_client.py b/tests/test_client.py index e5300e55d7..3287e0e706 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -6,13 +6,10 @@ import os import sys import json -import time import asyncio import inspect -import subprocess import tracemalloc from typing import Any, Union, Protocol, cast -from textwrap import dedent from unittest import mock from typing_extensions import Literal @@ -23,6 +20,7 @@ from openai import OpenAI, AsyncOpenAI, APIResponseValidationError from openai._types import Omit +from openai._utils import asyncify from openai._models import BaseModel, FinalRequestOptions from openai._streaming import Stream, AsyncStream from openai._exceptions import OpenAIError, APIStatusError, APITimeoutError, APIResponseValidationError @@ -30,8 +28,10 @@ DEFAULT_TIMEOUT, HTTPX_DEFAULT_TIMEOUT, BaseClient, + OtherPlatform, DefaultHttpxClient, DefaultAsyncHttpxClient, + get_platform, make_request_options, ) @@ -1857,50 +1857,9 @@ def retry_handler(_request: httpx.Request) -> httpx.Response: assert response.retries_taken == failures_before_success assert int(response.http_request.headers.get("x-stainless-retry-count")) == failures_before_success - def test_get_platform(self) -> None: - # A previous implementation of asyncify could leave threads unterminated when - # used with nest_asyncio. - # - # Since nest_asyncio.apply() is global and cannot be un-applied, this - # test is run in a separate process to avoid affecting other tests. 
- test_code = dedent(""" - import asyncio - import nest_asyncio - import threading - - from openai._utils import asyncify - from openai._base_client import get_platform - - async def test_main() -> None: - result = await asyncify(get_platform)() - print(result) - for thread in threading.enumerate(): - print(thread.name) - - nest_asyncio.apply() - asyncio.run(test_main()) - """) - with subprocess.Popen( - [sys.executable, "-c", test_code], - text=True, - ) as process: - timeout = 10 # seconds - - start_time = time.monotonic() - while True: - return_code = process.poll() - if return_code is not None: - if return_code != 0: - raise AssertionError("calling get_platform using asyncify resulted in a non-zero exit code") - - # success - break - - if time.monotonic() - start_time > timeout: - process.kill() - raise AssertionError("calling get_platform using asyncify resulted in a hung process") - - time.sleep(0.1) + async def test_get_platform(self) -> None: + platform = await asyncify(get_platform)() + assert isinstance(platform, (str, OtherPlatform)) async def test_proxy_environment_variables(self, monkeypatch: pytest.MonkeyPatch) -> None: # Test that the proxy environment variables are set correctly
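For illustration only (not part of the patch): a minimal sketch of the reshaped `turn_detection` configuration that this release introduces, mirroring the `server_vad` payload exercised in `tests/api_resources/realtime/test_client_secrets.py` above. The `client.realtime.client_secrets.create(...)` call shape, the `gpt-realtime` model name, and the `.value` field are assumptions drawn from the surrounding SDK, not from this diff; only the `turn_detection` keys come from the changes shown here.

```python
from openai import OpenAI

client = OpenAI()  # assumes OPENAI_API_KEY is set in the environment

# Hypothetical usage sketch: `turn_detection` is now a discriminated union keyed
# on `type` -- `server_vad` (which supports `idle_timeout_ms`) or `semantic_vad`
# (which supports `eagerness`). Values below are illustrative.
client_secret = client.realtime.client_secrets.create(
    session={
        "type": "realtime",
        "model": "gpt-realtime",  # assumed model name, for illustration
        "audio": {
            "input": {
                "turn_detection": {
                    "type": "server_vad",
                    "create_response": True,
                    # Trigger a model response automatically after 5s of silence
                    # following the end of the last response's audio playback.
                    "idle_timeout_ms": 5000,
                    "interrupt_response": True,
                },
            },
        },
    },
)

# `.value` (the ephemeral client secret) is an assumed response field.
print(client_secret.value)
```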