From 7ca668d10af09d566e1d6fe1516dfa33096ae727 Mon Sep 17 00:00:00 2001 From: Uri Shaket Date: Mon, 4 May 2026 10:08:55 +0300 Subject: [PATCH 01/12] initial commit Signed-off-by: Uri Shaket --- pyproject.toml | 2 + src/guidellm/backends/__init__.py | 4 + src/guidellm/backends/backend.py | 2 +- src/guidellm/backends/openai/__init__.py | 3 + src/guidellm/backends/openai/http.py | 58 ++------- src/guidellm/extras/audio.py | 104 +++++++++++++++++ tests/unit/backends/test_backend.py | 14 +++ .../schemas/generative/test_entrypoints.py | 18 +++ tests/unit/extras/test_audio.py | 75 +++++++++++- uv.lock | 110 +++++++++--------- 10 files changed, 286 insertions(+), 104 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 085f49489..a148dd54c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -86,6 +86,8 @@ audio = [ # Torchcodec needs specific torch version "torch==2.10.*", "torchcodec==0.10.*", + # openai_realtime_ws backend (vLLM /v1/realtime) + "websockets>=13.0,<16.0", ] vision = [ "datasets[vision]", diff --git a/src/guidellm/backends/__init__.py b/src/guidellm/backends/__init__.py index 52ba6ecb3..d423daf54 100644 --- a/src/guidellm/backends/__init__.py +++ b/src/guidellm/backends/__init__.py @@ -16,6 +16,8 @@ AudioRequestHandler, ChatCompletionsRequestHandler, OpenAIHTTPBackend, + OpenAIRealtimeWebSocketBackend, + OpenAIRealtimeWsBackendArgs, OpenAIRequestHandler, OpenAIRequestHandlerFactory, TextCompletionsRequestHandler, @@ -35,6 +37,8 @@ "BackendType", "ChatCompletionsRequestHandler", "OpenAIHTTPBackend", + "OpenAIRealtimeWebSocketBackend", + "OpenAIRealtimeWsBackendArgs", "OpenAIRequestHandler", "OpenAIRequestHandlerFactory", "TextCompletionsRequestHandler", diff --git a/src/guidellm/backends/backend.py b/src/guidellm/backends/backend.py index 88c3617cb..690b0cd44 100644 --- a/src/guidellm/backends/backend.py +++ b/src/guidellm/backends/backend.py @@ -24,7 +24,7 @@ ] -BackendType = Literal["openai_http", "vllm_python"] +BackendType = Literal["openai_http", "openai_realtime_ws", "vllm_python"] class BackendArgs(BaseModel): diff --git a/src/guidellm/backends/openai/__init__.py b/src/guidellm/backends/openai/__init__.py index fd76e80df..da4f2bbb7 100644 --- a/src/guidellm/backends/openai/__init__.py +++ b/src/guidellm/backends/openai/__init__.py @@ -1,4 +1,5 @@ from .http import OpenAIHTTPBackend +from .realtime_ws import OpenAIRealtimeWebSocketBackend, OpenAIRealtimeWsBackendArgs from .request_handlers import ( AudioRequestHandler, ChatCompletionsRequestHandler, @@ -12,6 +13,8 @@ "AudioRequestHandler", "ChatCompletionsRequestHandler", "OpenAIHTTPBackend", + "OpenAIRealtimeWebSocketBackend", + "OpenAIRealtimeWsBackendArgs", "OpenAIRequestHandler", "OpenAIRequestHandlerFactory", "ResponsesRequestHandler", diff --git a/src/guidellm/backends/openai/http.py b/src/guidellm/backends/openai/http.py index 527bb53aa..ef83b4265 100644 --- a/src/guidellm/backends/openai/http.py +++ b/src/guidellm/backends/openai/http.py @@ -19,6 +19,11 @@ from pydantic import Field, field_validator from guidellm.backends.backend import Backend, BackendArgs +from guidellm.backends.openai.openai_common import ( + FALLBACK_TIMEOUT, + build_openai_headers, + resolve_openai_validate_kwargs, +) from guidellm.backends.openai.request_handlers import OpenAIRequestHandlerFactory from guidellm.schemas import ( GenerationRequest, @@ -123,10 +128,6 @@ def validate_request_format(cls, v: str | None) -> str | None: "audio_translations": "/v1/audio/translations", } -# NOTE: This value is taken from httpx's default 
-FALLBACK_TIMEOUT = 5.0 - - @Backend.register("openai_http") class OpenAIHTTPBackend(Backend): """ @@ -501,52 +502,11 @@ async def _aiter_lines(self, stream: httpx.Response) -> AsyncIterator[str]: def _build_headers( self, existing_headers: dict[str, str] | None = None ) -> dict[str, str] | None: - """ - Build headers dictionary with bearer token authentication. - - Merges the Authorization bearer token header (if api_key is set) with any - existing headers. User-provided headers take precedence over the bearer token. - - :param existing_headers: Optional existing headers to merge with - :return: Dictionary of headers with bearer token included if api_key is set - """ - headers: dict[str, str] = {} - - # Add bearer token if api_key is set - if self.api_key: - headers["Authorization"] = f"Bearer {self.api_key}" - - # Merge with existing headers (user headers take precedence) - if existing_headers: - headers = {**headers, **existing_headers} - - return headers or None + return build_openai_headers(self.api_key, existing_headers) def _resolve_validate_kwargs( self, validate_backend: bool | str | dict[str, Any] ) -> dict[str, Any] | None: - if not (validate_kwargs := validate_backend): - return None - - if validate_kwargs is True: - validate_kwargs = "/health" - - if isinstance(validate_kwargs, str) and validate_kwargs in self.api_routes: - validate_kwargs = f"{self.target}/{self.api_routes[validate_kwargs]}" - - if isinstance(validate_kwargs, str): - validate_kwargs = { - "method": "GET", - "url": validate_kwargs, - } - - if not isinstance(validate_kwargs, dict) or "url" not in validate_kwargs: - raise ValueError( - "validate_backend must be a boolean, string, or dictionary and contain " - f"a target URL. Got: {validate_kwargs}" - ) - - if "method" not in validate_kwargs: - validate_kwargs["method"] = "GET" - - return validate_kwargs + return resolve_openai_validate_kwargs( + validate_backend, self.target, self.api_routes + ) diff --git a/src/guidellm/extras/audio.py b/src/guidellm/extras/audio.py index fe05f2275..e8bcd6cde 100644 --- a/src/guidellm/extras/audio.py +++ b/src/guidellm/extras/audio.py @@ -1,5 +1,6 @@ from __future__ import annotations +import base64 from pathlib import Path from typing import Any, Literal @@ -17,6 +18,7 @@ __all__ = [ "encode_audio", "is_url", + "pcm16_append_b64_chunks", ] @@ -212,3 +214,105 @@ def _encode_audio( def get_file_name(path: Path | str) -> str: """Get file name from path.""" return Path(path).name + + +# Decoded float waveforms are nominally in [-1.0, 1.0]; clip before scaling to int16. +_PCM16_WAVE_CLIP_MIN = -1.0 +_PCM16_WAVE_CLIP_MAX = 1.0 +# Symmetric int16 positive peak (2**15 - 1); standard float[-1, 1] -> PCM16 mapping. 
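+# Scaling by 32767 (not 32768) keeps the mapping symmetric: -1.0 -> -32767 and
+# +1.0 -> +32767, so int16's -32768 is never produced and +1.0 cannot overflow.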
+_PCM16_FLOAT_TO_INT16_SCALE = 32767.0 +_BYTES_PER_PCM16_SAMPLE = 2 + + +def _sample_rate_hint_from_audio_column_dict(d: dict[str, Any]) -> int | None: + """Return ``sample_rate`` / ``sampling_rate`` from an audio column dict.""" + hint = d.get("sample_rate", d.get("sampling_rate")) + if ( + hint is not None + and not isinstance(hint, bool) + and isinstance(hint, int | float) + and hint > 0 + ): + return int(round(float(hint))) + return None + + +def _require_positive_sample_rate(sr_raw: Any) -> float: + if isinstance(sr_raw, bool) or not isinstance(sr_raw, int | float) or sr_raw <= 0: + raise ValueError( + "Decoded audio has invalid sample_rate " + f"{sr_raw!r}; expected a positive number" + ) + return float(sr_raw) + + +def pcm16_append_b64_chunks( + audio_item: dict[str, Any] | bytes, + *, + target_sample_rate: int = 16000, + chunk_samples: int = 3200, +) -> list[str]: + """ + Decode audio to base64-encoded PCM16 mono chunks for realtime ``append`` events. + + Matches vLLM ``input_audio_buffer.append`` (PCM16 mono at ``target_sample_rate`` + Hz), split into ``chunk_samples``-frame segments. + Equivalent conversion flow to vLLM's realtime microphone client example, but + generalized for dataset/file inputs used by GuideLLM benchmarks. + """ + # Accept common audio column shapes used in GuideLLM datasets. + if isinstance(audio_item, dict): + if "audio" in audio_item: + decode_sr = _sample_rate_hint_from_audio_column_dict(audio_item) + samples = _decode_audio( + audio_item["audio"], + sample_rate=decode_sr, + ) + elif "data" in audio_item or "url" in audio_item: + samples = _decode_audio(audio_item) + else: + raise ValueError( + "audio_column dict must include 'audio', 'data', or 'url' " + "(same shapes as encode_audio / _decode_audio); " + f"got keys {list(audio_item)!r}" + ) + else: + samples = _decode_audio(audio_item) + + # Ensure channel-first shape, then downmix to mono for realtime PCM input. + data = samples.data + if data.dim() == 1: + data = data.unsqueeze(0) + if data.shape[0] > 1: + data = data.mean(dim=0, keepdim=True) + + # Realtime endpoint expects 16 kHz PCM16 mono. + sr = _require_positive_sample_rate(samples.sample_rate) + if sr != target_sample_rate: + t_in = data.shape[1] + t_out = max(1, int(round(t_in * target_sample_rate / sr))) + data = torch.nn.functional.interpolate( + data.unsqueeze(0), + size=t_out, + mode="linear", + align_corners=False, + ).squeeze(0) + + # Convert float waveform to signed little-endian PCM16 bytes. + wave = data.squeeze(0) + pcm_i16 = ( + wave.clamp(_PCM16_WAVE_CLIP_MIN, _PCM16_WAVE_CLIP_MAX) + * _PCM16_FLOAT_TO_INT16_SCALE + ).round().to(torch.int16) + buf = pcm_i16.cpu().numpy().tobytes() + + # Split PCM bytes into chunk-sized base64 payloads for append events. 
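+    # At the 16 kHz target rate, the default chunk_samples=3200 covers 200 ms
+    # of audio per append event (3200 / 16000 s), i.e. 6400 bytes of PCM16 each.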
+ chunk_bytes = max(1, chunk_samples) * _BYTES_PER_PCM16_SAMPLE + out: list[str] = [] + for i in range(0, len(buf), chunk_bytes): + pcm_chunk = buf[i : i + chunk_bytes] + if pcm_chunk: + out.append(base64.b64encode(pcm_chunk).decode("ascii")) + if not out: + raise ValueError("Decoded audio produced no PCM data") + return out diff --git a/tests/unit/backends/test_backend.py b/tests/unit/backends/test_backend.py index 1cae4952a..4dbb76ac2 100644 --- a/tests/unit/backends/test_backend.py +++ b/tests/unit/backends/test_backend.py @@ -318,6 +318,20 @@ async def default_model(self) -> str: assert backend.type_ == "mock_backend" @pytest.mark.smoke + def test_openai_realtime_ws_backend_registered(self): + """Realtime WebSocket backend is registered and constructible.""" + from guidellm.backends.openai import ( + OpenAIRealtimeWebSocketBackend, + OpenAIRealtimeWsBackendArgs, + ) + + assert Backend.is_registered("openai_realtime_ws") + realtime_args = Backend.get_backend_args("openai_realtime_ws") + assert realtime_args is OpenAIRealtimeWsBackendArgs + backend = Backend.create("openai_realtime_ws", target="http://localhost:9000") + assert isinstance(backend, OpenAIRealtimeWebSocketBackend) + assert backend.type_ == "openai_realtime_ws" + def test_openai_backend_registered(self): """Test that OpenAI HTTP backend is registered.""" from guidellm.backends.openai import OpenAIHTTPBackend diff --git a/tests/unit/benchmark/schemas/generative/test_entrypoints.py b/tests/unit/benchmark/schemas/generative/test_entrypoints.py index 2e0e1623b..f3758ff1e 100644 --- a/tests/unit/benchmark/schemas/generative/test_entrypoints.py +++ b/tests/unit/benchmark/schemas/generative/test_entrypoints.py @@ -12,6 +12,7 @@ from guidellm.backends.backend import BackendArgs from guidellm.backends.openai.http import OpenAIHttpBackendArgs +from guidellm.backends.openai.realtime_ws import OpenAIRealtimeWsBackendArgs from guidellm.benchmark.schemas.generative.entrypoints import ( BenchmarkGenerativeTextArgs, ) @@ -52,6 +53,23 @@ def test_dict_backend_kwargs_transformed(self): assert args.backend_kwargs.target == "http://localhost:9000" assert args.backend_kwargs.model == "test_model" + def test_openai_realtime_ws_backend_kwargs_validates(self) -> None: + """Realtime WS backend is selected explicitly; no request_format shim.""" + args = BenchmarkGenerativeTextArgs.model_validate( + { + "backend": "openai_realtime_ws", + "backend_kwargs": { + "target": "http://localhost:8000", + "model": "rt-model", + }, + "data": ["prompt_tokens=256,output_tokens=128"], + } + ) + assert args.backend == "openai_realtime_ws" + assert isinstance(args.backend_kwargs, OpenAIRealtimeWsBackendArgs) + assert args.backend_kwargs.target == "http://localhost:8000" + assert args.backend_kwargs.model == "rt-model" + def test_dict_with_request_format(self): """ Test that request_format is included in BackendArgs transformation. 
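A minimal usage sketch of how ``pcm16_append_b64_chunks`` feeds the realtime
event flow implemented in this series (the ``clip.wav`` path and the literal
frame list are illustrative only, not part of the patch):

    import json
    from pathlib import Path

    from guidellm.extras.audio import pcm16_append_b64_chunks

    # Dataset-style audio item; the helper decodes, downmixes, and resamples it.
    audio_item = {"audio": Path("clip.wav").read_bytes(), "format": "wav"}

    # One input_audio_buffer.append text frame per base64 PCM16 chunk.
    frames = [
        json.dumps({"type": "input_audio_buffer.append", "audio": b64})
        for b64 in pcm16_append_b64_chunks(audio_item, chunk_samples=3200)
    ]
    # The backend follows with commit final=False (start transcription) and
    # final=True (end of audio stream).
    frames.append(json.dumps({"type": "input_audio_buffer.commit", "final": False}))
    frames.append(json.dumps({"type": "input_audio_buffer.commit", "final": True}))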
diff --git a/tests/unit/extras/test_audio.py b/tests/unit/extras/test_audio.py index b7f783693..c890a21cb 100644 --- a/tests/unit/extras/test_audio.py +++ b/tests/unit/extras/test_audio.py @@ -7,7 +7,7 @@ import pytest import torch -from guidellm.extras.audio import encode_audio +from guidellm.extras.audio import encode_audio, pcm16_append_b64_chunks @pytest.fixture @@ -194,3 +194,76 @@ def test_end_to_end_audio_processing(sample_audio_tensor): assert result["format"] == "mp3" assert result["audio_samples"] == 16000 assert result["audio_seconds"] == min(original_duration, 0.5) + + +@patch("guidellm.extras.audio._decode_audio") +def test_pcm16_append_b64_chunks_rejects_unknown_dict_keys(mock_decode): + mock_decode.side_effect = AssertionError("_decode_audio should not run") + with pytest.raises(ValueError, match="audio_column dict"): + pcm16_append_b64_chunks({"foo": 1}) + + +@patch("guidellm.extras.audio._decode_audio") +def test_pcm16_append_b64_chunks_splits_into_multiple_base64_chunks(mock_decode): + mock_decode.return_value = MagicMock() + mock_decode.return_value.data = torch.zeros(1, 5000) + mock_decode.return_value.sample_rate = 16000 + + out = pcm16_append_b64_chunks({"audio": b"x"}, chunk_samples=3200) + + assert len(out) == 2 + assert all(isinstance(chunk_b64, str) for chunk_b64 in out) + + +@patch("guidellm.extras.audio._decode_audio") +def test_pcm16_append_b64_chunks_empty_wave_raises(mock_decode): + mock_decode.return_value = MagicMock() + mock_decode.return_value.data = torch.zeros(1, 0) + mock_decode.return_value.sample_rate = 16000 + + with pytest.raises(ValueError, match="no PCM"): + pcm16_append_b64_chunks({"audio": b"x"}) + + +@patch("guidellm.extras.audio._decode_audio") +def test_pcm16_append_b64_chunks_downmixes_stereo(mock_decode): + mock_decode.return_value = MagicMock() + mock_decode.return_value.data = torch.randn(2, 200) + mock_decode.return_value.sample_rate = 16000 + + out = pcm16_append_b64_chunks({"audio": b"x"}, chunk_samples=100) + + assert len(out) >= 1 + + +@patch("guidellm.extras.audio._decode_audio") +def test_pcm16_append_b64_chunks_audio_dict_passes_outer_sample_rate(mock_decode): + mock_decode.return_value = MagicMock() + mock_decode.return_value.data = torch.zeros(1, 100) + mock_decode.return_value.sample_rate = 16000 + + pcm16_append_b64_chunks({"audio": b"x", "sample_rate": 8000}) + + mock_decode.assert_called_once() + assert mock_decode.call_args.kwargs.get("sample_rate") == 8000 + + +@patch("guidellm.extras.audio._decode_audio") +def test_pcm16_append_b64_chunks_sampling_rate_alias(mock_decode): + mock_decode.return_value = MagicMock() + mock_decode.return_value.data = torch.zeros(1, 50) + mock_decode.return_value.sample_rate = 16000 + + pcm16_append_b64_chunks({"audio": b"x", "sampling_rate": 44100}) + + assert mock_decode.call_args.kwargs.get("sample_rate") == 44100 + + +@patch("guidellm.extras.audio._decode_audio") +def test_pcm16_append_b64_chunks_invalid_decoder_sample_rate_raises(mock_decode): + mock_decode.return_value = MagicMock() + mock_decode.return_value.data = torch.zeros(1, 10) + mock_decode.return_value.sample_rate = 0 + + with pytest.raises(ValueError, match="invalid sample_rate"): + pcm16_append_b64_chunks({"audio": b"x"}) diff --git a/uv.lock b/uv.lock index c09f6e994..bc018d7fd 100644 --- a/uv.lock +++ b/uv.lock @@ -841,12 +841,14 @@ all = [ { name = "torch", version = "2.10.0+cpu", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform != 'darwin'" }, { name = "torchcodec" }, { name = "uvloop" 
}, + { name = "websockets" }, ] audio = [ { name = "datasets", extra = ["audio"] }, { name = "torch", version = "2.10.0", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform == 'darwin'" }, { name = "torch", version = "2.10.0+cpu", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform != 'darwin'" }, { name = "torchcodec" }, + { name = "websockets" }, ] dev = [ { name = "blobfile" }, @@ -889,6 +891,7 @@ dev = [ { name = "types-requests" }, { name = "types-toml" }, { name = "uvloop" }, + { name = "websockets" }, ] perf = [ { name = "msgpack" }, @@ -985,6 +988,7 @@ requires-dist = [ { name = "types-toml", marker = "extra == 'dev'" }, { name = "uvloop", specifier = ">=0.18" }, { name = "uvloop", marker = "extra == 'perf'" }, + { name = "websockets", marker = "extra == 'audio'", specifier = ">=13.0,<16.0" }, ] provides-extras = ["all", "recommended", "perf", "tokenizers", "audio", "vision", "dev"] @@ -3932,21 +3936,21 @@ dependencies = [ { name = "typing-extensions", marker = "sys_platform == 'darwin'" }, ] wheels = [ - { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0-1-cp310-none-macosx_11_0_arm64.whl", hash = "sha256:4db72a4d257c45c3502f11764ee41460a87312fdc3dff47a8957812efe961725" }, - { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0-1-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:0826ac8e409551e12b2360ac18b4161a838cbd111933e694752f351191331d09" }, - { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0-1-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:7fbbf409143a4fe0812a40c0b46a436030a7e1d14fe8c5234dfbe44df47f617e" }, - { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0-1-cp313-none-macosx_11_0_arm64.whl", hash = "sha256:b39cafff7229699f9d6e172cac74d85fd71b568268e439e08d9c540e54732a3e" }, - { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0-2-cp310-none-macosx_11_0_arm64.whl", hash = "sha256:7417ef370d7c3969dd509dae8d5c7daeb945af335ab76dd38358ba30a91251c1" }, - { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0-2-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:90821a3194b8806d9fa9fdaa9308c1bc73df0c26808274b14129a97c99f35794" }, - { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0-2-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:358bd7125cbec6e692d60618a5eec7f55a51b29e3652a849fd42af021d818023" }, - { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0-2-cp313-none-macosx_11_0_arm64.whl", hash = "sha256:470de4176007c2700735e003a830828a88d27129032a3add07291da07e2a94e8" }, - { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0-cp310-none-macosx_11_0_arm64.whl", hash = "sha256:2d16abfce6c92584ceeb00c3b2665d5798424dd9ed235ea69b72e045cd53ae97" }, - { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:4584ab167995c0479f6821e3dceaf199c8166c811d3adbba5d8eedbbfa6764fd" }, - { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:45a1c5057629444aeb1c452c18298fa7f30f2f7aeadd4dc41f9d340980294407" }, - { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:339e05502b6c839db40e88720cb700f5a3b50cda332284873e851772d41b2c1e" }, - { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0-cp313-none-macosx_11_0_arm64.whl", hash = "sha256:840351da59cedb7bcbc51981880050813c19ef6b898a7fecf73a3afc71aff3fe" }, - { url = 
"https://download-r2.pytorch.org/whl/cpu/torch-2.10.0-cp314-cp314-macosx_14_0_arm64.whl", hash = "sha256:c88b1129fd4e14f0f882963c6728315caae35d2f47374d17edeed1edc7697497" }, - { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0-cp314-cp314t-macosx_14_0_arm64.whl", hash = "sha256:f4bea7dc451267c028593751612ad559299589304e68df54ae7672427893ff2c" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0-1-cp310-none-macosx_11_0_arm64.whl", hash = "sha256:4db72a4d257c45c3502f11764ee41460a87312fdc3dff47a8957812efe961725", upload-time = "2026-02-06T16:27:14Z" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0-1-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:0826ac8e409551e12b2360ac18b4161a838cbd111933e694752f351191331d09", upload-time = "2026-02-06T16:27:14Z" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0-1-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:7fbbf409143a4fe0812a40c0b46a436030a7e1d14fe8c5234dfbe44df47f617e", upload-time = "2026-02-06T16:27:14Z" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0-1-cp313-none-macosx_11_0_arm64.whl", hash = "sha256:b39cafff7229699f9d6e172cac74d85fd71b568268e439e08d9c540e54732a3e", upload-time = "2026-02-06T16:27:17Z" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0-2-cp310-none-macosx_11_0_arm64.whl", hash = "sha256:7417ef370d7c3969dd509dae8d5c7daeb945af335ab76dd38358ba30a91251c1", upload-time = "2026-02-10T19:55:42Z" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0-2-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:90821a3194b8806d9fa9fdaa9308c1bc73df0c26808274b14129a97c99f35794", upload-time = "2026-02-10T19:55:42Z" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0-2-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:358bd7125cbec6e692d60618a5eec7f55a51b29e3652a849fd42af021d818023", upload-time = "2026-02-10T19:55:42Z" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0-2-cp313-none-macosx_11_0_arm64.whl", hash = "sha256:470de4176007c2700735e003a830828a88d27129032a3add07291da07e2a94e8", upload-time = "2026-02-10T19:55:43Z" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0-cp310-none-macosx_11_0_arm64.whl", hash = "sha256:2d16abfce6c92584ceeb00c3b2665d5798424dd9ed235ea69b72e045cd53ae97", upload-time = "2026-01-23T15:09:55Z" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:4584ab167995c0479f6821e3dceaf199c8166c811d3adbba5d8eedbbfa6764fd", upload-time = "2026-01-23T15:09:55Z" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:45a1c5057629444aeb1c452c18298fa7f30f2f7aeadd4dc41f9d340980294407", upload-time = "2026-01-23T15:09:55Z" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:339e05502b6c839db40e88720cb700f5a3b50cda332284873e851772d41b2c1e", upload-time = "2026-01-23T15:09:57Z" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0-cp313-none-macosx_11_0_arm64.whl", hash = "sha256:840351da59cedb7bcbc51981880050813c19ef6b898a7fecf73a3afc71aff3fe", upload-time = "2026-01-23T15:09:59Z" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0-cp314-cp314-macosx_14_0_arm64.whl", hash = "sha256:c88b1129fd4e14f0f882963c6728315caae35d2f47374d17edeed1edc7697497", upload-time = "2026-01-23T15:09:59Z" }, + { url = 
"https://download-r2.pytorch.org/whl/cpu/torch-2.10.0-cp314-cp314t-macosx_14_0_arm64.whl", hash = "sha256:f4bea7dc451267c028593751612ad559299589304e68df54ae7672427893ff2c", upload-time = "2026-01-23T15:10:01Z" }, ] [[package]] @@ -3969,44 +3973,44 @@ dependencies = [ { name = "typing-extensions", marker = "sys_platform != 'darwin'" }, ] wheels = [ - { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp310-cp310-linux_aarch64.whl", hash = "sha256:31ae44836c8b9bbd1a3943d29c7c7457709ddf7c6173aa34aefe9d2203e4c405" }, - { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp310-cp310-linux_s390x.whl", hash = "sha256:beadc2a6a1785b09a46daad378de91ef274b8d3eea7af0bc2d017d97f115afdf" }, - { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:d63ee6a80982fd73fe44bb70d97d2976e010312ff6db81d7bfb9167b06dd45b9" }, - { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:a280ffaea7b9c828e0c1b9b3bd502d9b6a649dc9416997b69b84544bd469f215" }, - { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp310-cp310-win_amd64.whl", hash = "sha256:6c6f0df770144907092a0d067048d96ed4f278a6c840376d2ff0e27e7579b925" }, - { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp311-cp311-linux_aarch64.whl", hash = "sha256:ce5c113d1f55f8c1f5af05047a24e50d11d293e0cbbb5bf7a75c6c761edd6eaa" }, - { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp311-cp311-linux_s390x.whl", hash = "sha256:0e286fcf6ce0cc7b204396c9b4ea0d375f1f0c3e752f68ce3d3aeb265511db8c" }, - { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:1cfcb9b1558c6e52dffd0d4effce83b13c5ae5d97338164c372048c21f9cfccb" }, - { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:b7cb1ec66cefb90fd7b676eac72cfda3b8d4e4d0cacd7a531963bc2e0a9710ab" }, - { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp311-cp311-win_amd64.whl", hash = "sha256:17a09465bab2aab8f0f273410297133d8d8fb6dd84dccbd252ca4a4f3a111847" }, - { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp311-cp311-win_arm64.whl", hash = "sha256:c35c0de592941d4944698dbfa87271ab85d3370eca3b694943a2ab307ac34b3f" }, - { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp312-cp312-linux_aarch64.whl", hash = "sha256:8de5a36371b775e2d4881ed12cc7f2de400b1ad3d728aa74a281f649f87c9b8c" }, - { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp312-cp312-linux_s390x.whl", hash = "sha256:9accc30b56cb6756d4a9d04fcb8ebc0bb68c7d55c1ed31a8657397d316d31596" }, - { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:179451716487f8cb09b56459667fa1f5c4c0946c1e75fbeae77cfc40a5768d87" }, - { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:ee40b8a4b4b2cf0670c6fd4f35a7ef23871af956fecb238fbf5da15a72650b1d" }, - { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp312-cp312-win_amd64.whl", hash = "sha256:21cb5436978ef47c823b7a813ff0f8c2892e266cfe0f1d944879b5fba81bf4e1" }, - { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp312-cp312-win_arm64.whl", hash = "sha256:3eaa727e6a73affa61564d86b9d03191df45c8650d0666bd3d57c8597ef61e78" }, - { url = 
"https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp313-cp313-linux_aarch64.whl", hash = "sha256:fd215f3d0f681905c5b56b0630a3d666900a37fcc3ca5b937f95275c66f9fd9c" }, - { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp313-cp313-linux_s390x.whl", hash = "sha256:170a0623108055be5199370335cf9b41ba6875b3cb6f086db4aee583331a4899" }, - { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:e51994492cdb76edce29da88de3672a3022f9ef0ffd90345436948d4992be2c7" }, - { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:8d316e5bf121f1eab1147e49ad0511a9d92e4c45cc357d1ab0bee440da71a095" }, - { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp313-cp313-win_amd64.whl", hash = "sha256:b719da5af01b59126ac13eefd6ba3dd12d002dc0e8e79b8b365e55267a8189d3" }, - { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp313-cp313-win_arm64.whl", hash = "sha256:b67d91326e4ed9eccbd6b7d84ed7ffa43f93103aa3f0b24145f3001f3b11b714" }, - { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp313-cp313t-linux_aarch64.whl", hash = "sha256:5af75e5f49de21b0bdf7672bc27139bd285f9e8dbcabe2d617a2eb656514ac36" }, - { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp313-cp313t-linux_s390x.whl", hash = "sha256:ba51ef01a510baf8fff576174f702c47e1aa54389a9f1fba323bb1a5003ff0bf" }, - { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:0fedcb1a77e8f2aaf7bfd21591bf6d1e0b207473268c9be16b17cb7783253969" }, - { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:106dd1930cb30a4a337366ba3f9b25318ebf940f51fd46f789281dd9e736bdc4" }, - { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp313-cp313t-win_amd64.whl", hash = "sha256:eb1bde1ce198f05c8770017de27e001d404499cf552aaaa014569eff56ca25c0" }, - { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp314-cp314-linux_aarch64.whl", hash = "sha256:ea2bcc9d1fca66974a71d4bf9a502539283f35d61fcab5a799b4e120846f1e02" }, - { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp314-cp314-linux_s390x.whl", hash = "sha256:f8294fd2fc6dd8f4435a891a0122307a043b14b21f0dac1bca63c85bfb59e586" }, - { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:a28fdbcfa2fbacffec81300f24dd1bed2b0ccfdbed107a823cff12bc1db070f6" }, - { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:aada8afc068add586464b2a55adb7cc9091eec55caf5320447204741cb6a0604" }, - { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp314-cp314-win_amd64.whl", hash = "sha256:2adc71fe471e98a608723bfc837f7e1929885ebb912c693597711e139c1cda41" }, - { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp314-cp314t-linux_aarch64.whl", hash = "sha256:9412bd37b70f5ebd1205242c4ba4cabae35a605947f2b30806d5c9b467936db9" }, - { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp314-cp314t-linux_s390x.whl", hash = "sha256:e71c476517c33e7db69825a9ff46c7f47a723ec4dac5b2481cff4246d1c632be" }, - { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = 
"sha256:23882f8d882460aca809882fc42f5e343bf07585274f929ced00177d1be1eb67" }, - { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:4fcd8b4cc2ae20f2b7749fb275349c55432393868778c2d50a08e81d5ee5591e" }, - { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp314-cp314t-win_amd64.whl", hash = "sha256:ffc8da9a1341092d6a90cb5b1c1a33cd61abf0fb43f0cd88443c27fa372c26ae" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp310-cp310-linux_aarch64.whl", hash = "sha256:31ae44836c8b9bbd1a3943d29c7c7457709ddf7c6173aa34aefe9d2203e4c405", upload-time = "2026-01-23T15:10:02Z" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp310-cp310-linux_s390x.whl", hash = "sha256:beadc2a6a1785b09a46daad378de91ef274b8d3eea7af0bc2d017d97f115afdf", upload-time = "2026-01-23T15:10:03Z" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:d63ee6a80982fd73fe44bb70d97d2976e010312ff6db81d7bfb9167b06dd45b9", upload-time = "2026-01-23T15:10:05Z" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:a280ffaea7b9c828e0c1b9b3bd502d9b6a649dc9416997b69b84544bd469f215", upload-time = "2026-01-23T15:10:07Z" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp310-cp310-win_amd64.whl", hash = "sha256:6c6f0df770144907092a0d067048d96ed4f278a6c840376d2ff0e27e7579b925", upload-time = "2026-01-23T15:10:09Z" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp311-cp311-linux_aarch64.whl", hash = "sha256:ce5c113d1f55f8c1f5af05047a24e50d11d293e0cbbb5bf7a75c6c761edd6eaa", upload-time = "2026-01-23T15:10:11Z" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp311-cp311-linux_s390x.whl", hash = "sha256:0e286fcf6ce0cc7b204396c9b4ea0d375f1f0c3e752f68ce3d3aeb265511db8c", upload-time = "2026-01-23T15:10:12Z" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:1cfcb9b1558c6e52dffd0d4effce83b13c5ae5d97338164c372048c21f9cfccb", upload-time = "2026-01-23T15:10:15Z" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:b7cb1ec66cefb90fd7b676eac72cfda3b8d4e4d0cacd7a531963bc2e0a9710ab", upload-time = "2026-01-23T15:10:15Z" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp311-cp311-win_amd64.whl", hash = "sha256:17a09465bab2aab8f0f273410297133d8d8fb6dd84dccbd252ca4a4f3a111847", upload-time = "2026-01-23T15:10:19Z" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp311-cp311-win_arm64.whl", hash = "sha256:c35c0de592941d4944698dbfa87271ab85d3370eca3b694943a2ab307ac34b3f", upload-time = "2026-01-23T15:10:20Z" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp312-cp312-linux_aarch64.whl", hash = "sha256:8de5a36371b775e2d4881ed12cc7f2de400b1ad3d728aa74a281f649f87c9b8c", upload-time = "2026-01-23T15:10:22Z" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp312-cp312-linux_s390x.whl", hash = "sha256:9accc30b56cb6756d4a9d04fcb8ebc0bb68c7d55c1ed31a8657397d316d31596", upload-time = "2026-01-23T15:10:24Z" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp312-cp312-manylinux_2_28_aarch64.whl", hash = 
"sha256:179451716487f8cb09b56459667fa1f5c4c0946c1e75fbeae77cfc40a5768d87", upload-time = "2026-01-23T15:10:25Z" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:ee40b8a4b4b2cf0670c6fd4f35a7ef23871af956fecb238fbf5da15a72650b1d", upload-time = "2026-01-23T15:10:27Z" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp312-cp312-win_amd64.whl", hash = "sha256:21cb5436978ef47c823b7a813ff0f8c2892e266cfe0f1d944879b5fba81bf4e1", upload-time = "2026-01-23T15:10:30Z" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp312-cp312-win_arm64.whl", hash = "sha256:3eaa727e6a73affa61564d86b9d03191df45c8650d0666bd3d57c8597ef61e78", upload-time = "2026-01-23T15:10:31Z" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp313-cp313-linux_aarch64.whl", hash = "sha256:fd215f3d0f681905c5b56b0630a3d666900a37fcc3ca5b937f95275c66f9fd9c", upload-time = "2026-01-23T15:10:34Z" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp313-cp313-linux_s390x.whl", hash = "sha256:170a0623108055be5199370335cf9b41ba6875b3cb6f086db4aee583331a4899", upload-time = "2026-01-23T15:10:35Z" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:e51994492cdb76edce29da88de3672a3022f9ef0ffd90345436948d4992be2c7", upload-time = "2026-01-23T15:10:37Z" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:8d316e5bf121f1eab1147e49ad0511a9d92e4c45cc357d1ab0bee440da71a095", upload-time = "2026-01-23T15:10:38Z" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp313-cp313-win_amd64.whl", hash = "sha256:b719da5af01b59126ac13eefd6ba3dd12d002dc0e8e79b8b365e55267a8189d3", upload-time = "2026-01-23T15:10:41Z" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp313-cp313-win_arm64.whl", hash = "sha256:b67d91326e4ed9eccbd6b7d84ed7ffa43f93103aa3f0b24145f3001f3b11b714", upload-time = "2026-01-23T15:10:42Z" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp313-cp313t-linux_aarch64.whl", hash = "sha256:5af75e5f49de21b0bdf7672bc27139bd285f9e8dbcabe2d617a2eb656514ac36", upload-time = "2026-01-23T15:10:44Z" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp313-cp313t-linux_s390x.whl", hash = "sha256:ba51ef01a510baf8fff576174f702c47e1aa54389a9f1fba323bb1a5003ff0bf", upload-time = "2026-01-23T15:10:48Z" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:0fedcb1a77e8f2aaf7bfd21591bf6d1e0b207473268c9be16b17cb7783253969", upload-time = "2026-01-23T15:10:48Z" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:106dd1930cb30a4a337366ba3f9b25318ebf940f51fd46f789281dd9e736bdc4", upload-time = "2026-01-23T15:10:50Z" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp313-cp313t-win_amd64.whl", hash = "sha256:eb1bde1ce198f05c8770017de27e001d404499cf552aaaa014569eff56ca25c0", upload-time = "2026-01-23T15:10:50Z" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp314-cp314-linux_aarch64.whl", hash = "sha256:ea2bcc9d1fca66974a71d4bf9a502539283f35d61fcab5a799b4e120846f1e02", upload-time = "2026-01-23T15:10:53Z" }, + { url = 
"https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp314-cp314-linux_s390x.whl", hash = "sha256:f8294fd2fc6dd8f4435a891a0122307a043b14b21f0dac1bca63c85bfb59e586", upload-time = "2026-01-23T15:10:55Z" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:a28fdbcfa2fbacffec81300f24dd1bed2b0ccfdbed107a823cff12bc1db070f6", upload-time = "2026-01-23T15:10:56Z" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:aada8afc068add586464b2a55adb7cc9091eec55caf5320447204741cb6a0604", upload-time = "2026-01-23T15:10:58Z" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp314-cp314-win_amd64.whl", hash = "sha256:2adc71fe471e98a608723bfc837f7e1929885ebb912c693597711e139c1cda41", upload-time = "2026-01-23T15:11:01Z" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp314-cp314t-linux_aarch64.whl", hash = "sha256:9412bd37b70f5ebd1205242c4ba4cabae35a605947f2b30806d5c9b467936db9", upload-time = "2026-01-23T15:11:03Z" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp314-cp314t-linux_s390x.whl", hash = "sha256:e71c476517c33e7db69825a9ff46c7f47a723ec4dac5b2481cff4246d1c632be", upload-time = "2026-01-23T15:11:04Z" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:23882f8d882460aca809882fc42f5e343bf07585274f929ced00177d1be1eb67", upload-time = "2026-01-23T15:11:07Z" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:4fcd8b4cc2ae20f2b7749fb275349c55432393868778c2d50a08e81d5ee5591e", upload-time = "2026-01-23T15:11:07Z" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp314-cp314t-win_amd64.whl", hash = "sha256:ffc8da9a1341092d6a90cb5b1c1a33cd61abf0fb43f0cd88443c27fa372c26ae", upload-time = "2026-01-23T15:11:10Z" }, ] [[package]] From ac586d23e6d3590aecac4a1aed88fa20ec3c592a Mon Sep 17 00:00:00 2001 From: Uri Shaket Date: Mon, 4 May 2026 10:21:58 +0300 Subject: [PATCH 02/12] missing files Signed-off-by: Uri Shaket --- src/guidellm/backends/openai/openai_common.py | 68 ++ src/guidellm/backends/openai/realtime_ws.py | 576 ++++++++++++++ tests/e2e/test_realtime_ws_e2e.py | 144 ++++ .../unit/backends/openai/test_realtime_ws.py | 735 ++++++++++++++++++ tests/unit/backends/test_backend.py | 5 +- .../schemas/generative/test_entrypoints.py | 5 +- tests/unit/extras/test_audio.py | 7 + 7 files changed, 1538 insertions(+), 2 deletions(-) create mode 100644 src/guidellm/backends/openai/openai_common.py create mode 100644 src/guidellm/backends/openai/realtime_ws.py create mode 100644 tests/e2e/test_realtime_ws_e2e.py create mode 100644 tests/unit/backends/openai/test_realtime_ws.py diff --git a/src/guidellm/backends/openai/openai_common.py b/src/guidellm/backends/openai/openai_common.py new file mode 100644 index 000000000..c35d3d6bd --- /dev/null +++ b/src/guidellm/backends/openai/openai_common.py @@ -0,0 +1,68 @@ +"""Shared helpers for OpenAI-compatible HTTP and WebSocket backends.""" + +from __future__ import annotations + +from typing import Any + +# NOTE: Matches httpx's default connect timeout; shared by HTTP and WebSocket backends. 
+FALLBACK_TIMEOUT = 5.0 + + +def build_openai_headers( + api_key: str | None, + existing_headers: dict[str, str] | None = None, +) -> dict[str, str] | None: + """ + Build headers with bearer authentication for OpenAI-compatible requests. + + Merges the Authorization bearer token (if ``api_key`` is set) with any + existing headers. User-provided headers take precedence over the bearer token. + + :param api_key: Optional API key for Bearer authentication + :param existing_headers: Optional headers to merge in + :return: Headers dict, or ``None`` if there are no headers to send + """ + headers: dict[str, str] = {} + if api_key: + headers["Authorization"] = f"Bearer {api_key}" + if existing_headers: + headers = {**headers, **existing_headers} + return headers or None + + +def resolve_openai_validate_kwargs( + validate_backend: bool | str | dict[str, Any], + target: str, + api_routes: dict[str, str], +) -> dict[str, Any] | None: + """ + Normalize ``validate_backend`` into kwargs for ``httpx`` request(). + + Accepts the same shapes as + :class:`~guidellm.backends.openai.http.OpenAIHTTPBackend`. + """ + raw = validate_backend + if not raw: + return None + + if raw is True: + raw = "/health" + + if isinstance(raw, str): + url = f"{target}/{api_routes[raw]}" if raw in api_routes else raw + request_kwargs: dict[str, Any] = {"method": "GET", "url": url} + elif isinstance(raw, dict): + request_kwargs = raw + else: + request_kwargs = raw + + if not isinstance(request_kwargs, dict) or "url" not in request_kwargs: + raise ValueError( + "validate_backend must be a boolean, string, or dictionary and contain " + f"a target URL. Got: {request_kwargs}" + ) + + if "method" not in request_kwargs: + request_kwargs["method"] = "GET" + + return request_kwargs diff --git a/src/guidellm/backends/openai/realtime_ws.py b/src/guidellm/backends/openai/realtime_ws.py new file mode 100644 index 000000000..a84f0d5a9 --- /dev/null +++ b/src/guidellm/backends/openai/realtime_ws.py @@ -0,0 +1,576 @@ +""" +WebSocket backend for vLLM-compatible realtime audio transcription. + +Implements the JSON event protocol used by vLLM's ``/v1/realtime`` endpoint: +``session.created`` → ``session.update`` → ``input_audio_buffer.append`` → +``input_audio_buffer.commit`` (``final: false`` starts transcription, then +``final: true`` ends the audio stream) → ``transcription.delta`` / +``transcription.done``. +""" + +from __future__ import annotations + +import asyncio +import json +import ssl +import time +from collections.abc import AsyncIterator +from typing import TYPE_CHECKING, Any +from urllib.parse import ParseResult, urlparse + +import httpx +from pydantic import Field + +if TYPE_CHECKING: + from websockets.asyncio.client import ClientConnection + +from guidellm.backends.backend import Backend, BackendArgs +from guidellm.backends.openai.openai_common import ( + FALLBACK_TIMEOUT, + build_openai_headers, + resolve_openai_validate_kwargs, +) +from guidellm.backends.openai.request_handlers import AudioRequestHandler +from guidellm.schemas import ( + GenerationRequest, + GenerationRequestArguments, + GenerationResponse, + RequestInfo, +) + +__all__ = [ + "OpenAIRealtimeWebSocketBackend", + "OpenAIRealtimeWsBackendArgs", +] + +_WS_API_ROUTES = { + "/health": "health", + "/v1/models": "v1/models", +} + +# Guard against a misbehaving server that only emits ignored event types. +_MAX_IGNORED_WS_EVENT_TYPES = 50_000 + +# Per-message WebSocket recv timeout default so benchmark workers do not hang forever +# on a silent peer. 
Pass ``timeout=None`` to wait indefinitely. +_DEFAULT_WS_RECV_TIMEOUT = 120.0 + +_AUDIO_EXTRA_HINT = ( + "Install optional audio extras: pip install 'guidellm[audio]' " + "(includes websockets and torchcodec for realtime transcription)." +) + + +def _require_ws_connect() -> Any: + try: + from websockets.asyncio.client import connect as ws_connect + except ImportError as exc: + raise ImportError( + "The openai_realtime_ws backend requires the 'websockets' package. " + + _AUDIO_EXTRA_HINT + ) from exc + return ws_connect + + +def _ws_error_message(err: Any) -> str: + """Format WebSocket ``error`` for exceptions (supports dict payloads).""" + if isinstance(err, dict): + msg = err.get("message") or err.get("msg") + code = err.get("code") + parts = [str(p) for p in (code, msg) if p] + if parts: + return ": ".join(parts) + try: + return json.dumps(err)[:500] + except (TypeError, ValueError): + return repr(err) + if err is None or err == "": + return "WebSocket error" + return str(err) + + +def _model_ids_from_openai_models_payload(payload: Any) -> list[str]: + """Parse ``GET /v1/models`` JSON body; raise RuntimeError if shape is unexpected.""" + if not isinstance(payload, dict): + raise RuntimeError( + "Unexpected /v1/models response: top-level JSON must be an object, " + f"got {type(payload).__name__}" + ) + data = payload.get("data") + if not isinstance(data, list): + raise RuntimeError( + "Unexpected /v1/models response: 'data' must be a list, " + f"got {type(data).__name__}" + ) + ids: list[str] = [] + for i, item in enumerate(data): + if not isinstance(item, dict) or "id" not in item: + raise RuntimeError( + "Unexpected /v1/models response: each entry must be an object with " + f"'id' (index {i})" + ) + ids.append(str(item["id"])) + return ids + + +def _load_ws_event(raw: str) -> dict[str, Any]: + """Parse a JSON WebSocket text frame; raise RuntimeError on invalid JSON.""" + try: + parsed: Any = json.loads(raw) + except json.JSONDecodeError as exc: + raise RuntimeError( + f"Invalid JSON from realtime WebSocket: {exc.msg} at position {exc.pos}" + ) from exc + if not isinstance(parsed, dict): + raise RuntimeError( + f"Expected JSON object from realtime WebSocket, got {type(parsed).__name__}" + ) + return parsed + + +# Lazy import cache (no ``global``); tests may set ``pcm16_append_b64_chunks`` directly. +pcm16_append_b64_chunks: Any = None +_pcm_imported_fn: dict[str, Any] = {"fn": None} + + +def _ensure_pcm16_append_b64_chunks() -> Any: + if pcm16_append_b64_chunks is not None: + return pcm16_append_b64_chunks + if _pcm_imported_fn["fn"] is not None: + return _pcm_imported_fn["fn"] + try: + from guidellm.extras.audio import pcm16_append_b64_chunks as fn + except ImportError as exc: + raise ImportError( + "The openai_realtime_ws backend requires the audio extras for PCM " + "handling used in realtime transcription. 
" + + _AUDIO_EXTRA_HINT + ) from exc + _pcm_imported_fn["fn"] = fn + return fn + + +def _coerce_usage_int(value: Any) -> int | None: + if isinstance(value, bool): + return None + if isinstance(value, int | float): + return int(value) + if isinstance(value, str): + stripped = value.strip() + if not stripped: + return None + try: + return int(stripped) + except ValueError: + return None + return None + + +def _normalize_transcription_usage( + raw_usage: Any, +) -> dict[str, int | dict[str, int]] | None: + """Coerce OpenAI-style usage dict values to ints (including numeric strings).""" + if not isinstance(raw_usage, dict): + return None + result: dict[str, int | dict[str, int]] = {} + for key, val in raw_usage.items(): + if isinstance(val, dict): + inner: dict[str, int] = {} + for ik, iv in val.items(): + num = _coerce_usage_int(iv) + if num is not None: + inner[ik] = num + if inner: + result[key] = inner + else: + num = _coerce_usage_int(val) + if num is not None: + result[key] = num + return result if result else None + + +class OpenAIRealtimeWsBackendArgs(BackendArgs): + """Arguments for creating the realtime WebSocket backend.""" + + target: str = Field( + description=( + "HTTP(S) base URL of the server (WebSocket URL is derived from it)." + ), + json_schema_extra={ + "error_message": ( + "Backend '{backend_type}' requires --target with a valid URL." + ) + }, + ) + model: str | None = Field( + default=None, + description="Model identifier (required unless discoverable from /v1/models).", + ) + websocket_path: str = Field( + default="/v1/realtime", + description="WebSocket path on the server (default /v1/realtime).", + ) + chunk_samples: int = Field( + default=3200, + ge=1, + description="PCM16 frames per input_audio_buffer.append chunk (16 kHz).", + ) + api_key: str | None = Field(default=None, description="Bearer token if required.") + verify: bool = Field(default=False, description="Verify TLS certificates.") + timeout: float | None = Field( + default=_DEFAULT_WS_RECV_TIMEOUT, + description=( + "Per-message read timeout for WebSocket receives (seconds). " + f"Defaults to {_DEFAULT_WS_RECV_TIMEOUT}s so hung servers do not block " + "workers; use ``None`` for no limit." + ), + ) + timeout_connect: float = Field( + default=FALLBACK_TIMEOUT, + description="Timeout for establishing the WebSocket connection.", + ) + validate_backend: bool | str | dict[str, Any] = Field( + default=True, + description=( + "HTTP health check before benchmarks (same semantics as openai_http)." 
+ ), + ) + extras: dict[str, Any] | None = Field( + default=None, + description="Extra fields merged into session.update (backend model wins).", + ) + + +@Backend.register("openai_realtime_ws") +class OpenAIRealtimeWebSocketBackend(Backend): + """WebSocket client for realtime (streaming) audio transcription.""" + + @classmethod + def backend_args(cls) -> type[BackendArgs]: + return OpenAIRealtimeWsBackendArgs + + def __init__( + self, + target: str, + model: str = "", + websocket_path: str = "/v1/realtime", + chunk_samples: int = 3200, + api_key: str | None = None, + verify: bool = False, + timeout: float | None = _DEFAULT_WS_RECV_TIMEOUT, + timeout_connect: float = FALLBACK_TIMEOUT, + validate_backend: bool | str | dict[str, Any] = True, + extras: dict[str, Any] | None = None, + ): + super().__init__(type_="openai_realtime_ws") + self.target = target.rstrip("/").removesuffix("/v1") + self.model = model or "" + self.websocket_path = websocket_path + self.chunk_samples = chunk_samples + self.api_key = api_key + self.verify = verify + self.timeout = timeout + self.timeout_connect = timeout_connect + self.api_routes = _WS_API_ROUTES + self.validate_backend: dict[str, Any] | None = self._resolve_validate_kwargs( + validate_backend + ) + self.extras = extras or {} + self._in_process = False + self._async_client: httpx.AsyncClient | None = None + + @property + def info(self) -> dict[str, Any]: + return { + "target": self.target, + "model": self.model, + "websocket_path": self.websocket_path, + "chunk_samples": self.chunk_samples, + "timeout": self.timeout, + "timeout_connect": self.timeout_connect, + "verify": self.verify, + "validate_backend": self.validate_backend, + } + + def _parsed_target(self) -> ParseResult: + raw = self.target if "://" in self.target else f"http://{self.target}" + return urlparse(raw) + + def _ws_url(self) -> str: + parsed = self._parsed_target() + if not parsed.netloc: + raise ValueError(f"Invalid target URL for WebSocket: {self.target!r}") + ws_scheme = "wss" if parsed.scheme in ("https", "wss") else "ws" + path = self.websocket_path + if not path.startswith("/"): + path = f"/{path}" + return f"{ws_scheme}://{parsed.netloc}{path}" + + def _ssl_context(self) -> ssl.SSLContext | None: + if self._parsed_target().scheme in ("http", "ws"): + return None + ctx = ssl.create_default_context() + if not self.verify: + ctx.check_hostname = False + ctx.verify_mode = ssl.CERT_NONE + return ctx + + def _resolve_validate_kwargs( + self, validate_backend: bool | str | dict[str, Any] + ) -> dict[str, Any] | None: + return resolve_openai_validate_kwargs( + validate_backend, self.target, self.api_routes + ) + + def _build_headers( + self, existing_headers: dict[str, str] | None = None + ) -> dict[str, str] | None: + return build_openai_headers(self.api_key, existing_headers) + + async def process_startup(self) -> None: + if self._in_process: + raise RuntimeError("Backend already started up for process.") + self._async_client = httpx.AsyncClient( + timeout=httpx.Timeout( + FALLBACK_TIMEOUT, + read=self.timeout, + connect=self.timeout_connect, + ), + verify=self.verify, + limits=httpx.Limits( + max_connections=None, + max_keepalive_connections=None, + keepalive_expiry=5.0, + ), + ) + self._in_process = True + + async def process_shutdown(self) -> None: + if not self._in_process: + raise RuntimeError("Backend not started up for process.") + client = self._async_client + if client is None: + raise RuntimeError("Backend not started up for process.") + await client.aclose() + 
self._async_client = None + self._in_process = False + + async def validate(self) -> None: + if self._async_client is None: + raise RuntimeError("Backend not started up for process.") + if not self.validate_backend: + return + validate_kwargs = {**self.validate_backend} + existing_headers = validate_kwargs.get("headers") + validate_kwargs["headers"] = self._build_headers(existing_headers) + try: + response = await self._async_client.request(**validate_kwargs) + response.raise_for_status() + except Exception as exc: + raise RuntimeError( + "Backend validation request failed. Could not connect to the server " + "or validate the backend configuration." + ) from exc + + async def available_models(self) -> list[str]: + if self._async_client is None: + raise RuntimeError("Backend not started up for process.") + target = f"{self.target}/v1/models" + response = await self._async_client.get(target, headers=self._build_headers()) + response.raise_for_status() + try: + payload: Any = response.json() + except json.JSONDecodeError as exc: + raise RuntimeError( + "Unexpected /v1/models response: body is not valid JSON" + ) from exc + return _model_ids_from_openai_models_payload(payload) + + async def default_model(self) -> str: + if self.model: + return self.model + if not self._in_process: + return "" + models = await self.available_models() + self.model = models[0] if models else "" + return self.model + + async def resolve( # noqa: C901, PLR0912, PLR0915 # type: ignore[override, misc] + self, + request: GenerationRequest, + request_info: RequestInfo, + history: list[tuple[GenerationRequest, GenerationResponse | None]] + | None = None, + ) -> AsyncIterator[tuple[GenerationResponse | None, RequestInfo]]: + if self._async_client is None: + raise RuntimeError("Backend not started up for process.") + if history: + raise NotImplementedError( + "openai_realtime_ws does not support multiturn/history yet." + ) + + audio_columns = request.columns.get("audio_column", []) + if len(audio_columns) != 1: + raise ValueError( + "Realtime transcription expects exactly one audio_column entry; " + f"got {len(audio_columns)}." + ) + + model_name = await self.default_model() + if not str(model_name).strip(): + raise RuntimeError( + "No model configured for openai_realtime_ws and /v1/models returned " + "none. Pass --model or ensure the server lists at least one model." 
+ ) + + arguments = GenerationRequestArguments( + body={ + "model": model_name, + "websocket_path": self.websocket_path, + "chunk_samples": self.chunk_samples, + } + ) + + pcm_fn = _ensure_pcm16_append_b64_chunks() + chunks = pcm_fn( + audio_columns[0], + chunk_samples=self.chunk_samples, + ) + + session_update: dict[str, Any] = {"type": "session.update"} + if self.extras: + for key, val in self.extras.items(): + if key not in ("type", "model"): + session_update[key] = val + session_update["model"] = model_name + + ssl_ctx = self._ssl_context() + ws_headers = self._build_headers() + audio_handler = AudioRequestHandler() + full_text_parts: list[str] = [] + + try: + request_info.timings.request_start = time.time() + connect_kw: dict[str, Any] = { + "ssl": ssl_ctx, + "open_timeout": self.timeout_connect, + } + if ws_headers: + connect_kw["additional_headers"] = ws_headers + ws_connect = _require_ws_connect() + async with ws_connect(self._ws_url(), **connect_kw) as ws: + raw_first = await self._recv_ws(ws) + first_event = _load_ws_event(raw_first) + if first_event.get("type") == "error": + raise RuntimeError(_ws_error_message(first_event.get("error"))) + if first_event.get("type") != "session.created": + raise RuntimeError( + f"Expected session.created, got {first_event.get('type')!r}" + ) + await ws.send(json.dumps(session_update)) + for b64_chunk in chunks: + await ws.send( + json.dumps( + {"type": "input_audio_buffer.append", "audio": b64_chunk} + ) + ) + await ws.send( + json.dumps({"type": "input_audio_buffer.commit", "final": False}) + ) + # Sentinel end-of-stream for vLLM's audio queue + # (see RealtimeConnection). + await ws.send( + json.dumps({"type": "input_audio_buffer.commit", "final": True}) + ) + + ignored_events = 0 + while True: + raw = await self._recv_ws(ws) + event = _load_ws_event(raw) + et = event.get("type") + if et == "transcription.delta": + iter_time = time.time() + if request_info.timings.first_request_iteration is None: + request_info.timings.first_request_iteration = iter_time + request_info.timings.last_request_iteration = iter_time + request_info.timings.request_iterations += 1 + delta = event.get("delta") or "" + full_text_parts.append(delta) + if request_info.timings.first_token_iteration is None: + request_info.timings.first_token_iteration = iter_time + request_info.timings.token_iterations = 0 + yield None, request_info + request_info.timings.last_token_iteration = iter_time + request_info.timings.token_iterations += ( + 1 if delta else 0 + ) + + elif et == "transcription.done": + iter_time = time.time() + request_info.timings.request_end = iter_time + full_text = event.get("text") or "".join(full_text_parts) + if request_info.timings.first_token_iteration is None: + if request_info.timings.first_request_iteration is None: + request_info.timings.first_request_iteration = iter_time + request_info.timings.last_request_iteration = iter_time + request_info.timings.request_iterations += 1 + request_info.timings.first_token_iteration = iter_time + request_info.timings.token_iterations = 0 + yield None, request_info + request_info.timings.last_token_iteration = iter_time + request_info.timings.token_iterations += ( + 1 if full_text else 0 + ) + usage_dict = _normalize_transcription_usage(event.get("usage")) + inp, outp = audio_handler.extract_metrics(usage_dict, full_text) + yield ( + GenerationResponse( + request_id=request.request_id, + request_args=arguments.model_dump_json(), + text=full_text, + input_metrics=inp, + output_metrics=outp, + ), + request_info, + ) 
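+                    # transcription.done is terminal for this request; the
+                    # final response was yielded above, so stop reading frames.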
+                        break
+                    elif et == "error":
+                        raise RuntimeError(_ws_error_message(event.get("error")))
+                    else:
+                        ignored_events += 1
+                        if ignored_events > _MAX_IGNORED_WS_EVENT_TYPES:
+                            raise RuntimeError(
+                                "Exceeded maximum ignored realtime WebSocket events "
+                                f"without transcription.done (last type={et!r})."
+                            )
+                        continue
+
+        except asyncio.CancelledError as err:
+            text_so_far = "".join(full_text_parts)
+            inp, outp = audio_handler.extract_metrics(None, text_so_far or "")
+            yield (
+                GenerationResponse(
+                    request_id=request.request_id,
+                    request_args=arguments.model_dump_json(),
+                    text=text_so_far,
+                    input_metrics=inp,
+                    output_metrics=outp,
+                ),
+                request_info,
+            )
+            raise err
+        finally:
+            if (
+                request_info.timings.request_start is not None
+                and request_info.timings.request_end is None
+            ):
+                request_info.timings.request_end = time.time()
+
+    async def _recv_ws(self, ws: ClientConnection) -> str:
+        if self.timeout is None:
+            msg = await ws.recv()
+        else:
+            msg = await asyncio.wait_for(ws.recv(), timeout=self.timeout)
+        if isinstance(msg, bytes):
+            return msg.decode()
+        return str(msg)
diff --git a/tests/e2e/test_realtime_ws_e2e.py b/tests/e2e/test_realtime_ws_e2e.py
new file mode 100644
index 000000000..6f3e55e70
--- /dev/null
+++ b/tests/e2e/test_realtime_ws_e2e.py
@@ -0,0 +1,144 @@
+"""End-to-end integration: realtime backend + PCM encoding + WebSocket (same loop).
+
+## WRITTEN BY AI ##
+"""
+
+from __future__ import annotations
+
+import json
+import socket
+import struct
+import wave
+from collections.abc import Awaitable, Callable
+from pathlib import Path
+from typing import Any
+
+import pytest
+
+try:
+    from websockets.asyncio.server import serve
+except ImportError:
+    pytest.skip(
+        "websockets not installed; install guidellm[audio] for realtime e2e",
+        allow_module_level=True,
+    )
+
+from guidellm.backends.openai.realtime_ws import OpenAIRealtimeWebSocketBackend
+from guidellm.schemas import GenerationRequest, RequestInfo, RequestTimings
+
+
+def make_realtime_transcription_stub_handler(
+    *,
+    delta_text: str = "hello",
+    done_text: str | None = None,
+    usage: dict[str, Any] | None = None,
+    session_id: str = "stub-sess",
+) -> Callable[[Any], Awaitable[None]]:
+    """Build an async handler that completes one transcription after two commits."""
+
+    resolved_done = done_text if done_text is not None else delta_text
+    resolved_usage = usage or {
+        "prompt_tokens": 10,
+        "completion_tokens": 5,
+        "total_tokens": 15,
+    }
+
+    async def handler(ws: Any) -> None:
+        await ws.send(
+            json.dumps({"type": "session.created", "id": session_id, "created": 0})
+        )
+        commits: list[bool | None] = []
+        while True:
+            msg = await ws.recv()
+            payload = json.loads(msg if isinstance(msg, str) else msg.decode())
+            if payload.get("type") == "input_audio_buffer.commit":
+                commits.append(payload.get("final"))
+                if payload.get("final"):
+                    break
+        assert commits == [False, True]
+        await ws.send(
+            json.dumps({"type": "transcription.delta", "delta": delta_text})
+        )
+        await ws.send(
+            json.dumps(
+                {
+                    "type": "transcription.done",
+                    "text": resolved_done,
+                    "usage": resolved_usage,
+                }
+            )
+        )
+
+    return handler
+
+
+def _free_port() -> int:
+    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+        s.bind(("127.0.0.1", 0))
+        return int(s.getsockname()[1])
+
+
+def _write_minimal_wav_16k_mono(path: Path) -> None:
+    n_samples = 4000
+    with wave.open(str(path), "w") as wf:
+        wf.setnchannels(1)
+        wf.setsampwidth(2)
+        wf.setframerate(16000)
+        frames = b"".join(struct.pack("<h", 0) for _ in range(n_samples))
+        wf.writeframes(frames)
+
+
+@pytest.mark.asyncio
+async def test_realtime_ws_e2e(tmp_path: Path) -> None:
+    """
+    In-process: WebSocket
server, OpenAI realtime backend, and torchcodec PCM path. + + No ``guidellm benchmark`` subprocess (avoids worker/hang issues in test envs). + For a live vLLM run, use ``scripts/e2e_realtime_external.sh``. + """ + port = _free_port() + wav_path = tmp_path / "clip.wav" + _write_minimal_wav_16k_mono(wav_path) + audio_item = { + "audio": wav_path.read_bytes(), + "file_name": "clip.wav", + "format": "wav", + } + request = GenerationRequest( + request_id="e2e-1", + columns={"audio_column": [audio_item]}, + ) + info = RequestInfo(timings=RequestTimings()) + + stub = make_realtime_transcription_stub_handler(session_id="e2e-stub-sess") + async with serve(stub, "127.0.0.1", port): + be = OpenAIRealtimeWebSocketBackend( + target=f"http://127.0.0.1:{port}", + model="stub-model", + validate_backend=False, + ) + await be.process_startup() + try: + out: list = [] + async for item in be.resolve(request, info): + out.append(item) + finally: + await be.process_shutdown() + + assert len(out) == 2 + assert out[0][0] is None + final = out[1][0] + assert final is not None + assert final.text == "hello" + assert final.input_metrics.audio_tokens == 10 + assert final.output_metrics.text_tokens == 5 diff --git a/tests/unit/backends/openai/test_realtime_ws.py b/tests/unit/backends/openai/test_realtime_ws.py new file mode 100644 index 000000000..115ceb3c4 --- /dev/null +++ b/tests/unit/backends/openai/test_realtime_ws.py @@ -0,0 +1,735 @@ +"""Tests for OpenAIRealtimeWebSocketBackend. + +## WRITTEN BY AI ## +""" + +from __future__ import annotations + +import asyncio +import contextlib +import json + +import pytest + +try: + from websockets.asyncio.server import serve + from websockets.exceptions import ConnectionClosed +except ImportError: + pytest.skip( + "websockets not installed; install guidellm[audio] for realtime tests", + allow_module_level=True, + ) + +from guidellm.backends.openai.realtime_ws import ( + _DEFAULT_WS_RECV_TIMEOUT, + OpenAIRealtimeWebSocketBackend, + OpenAIRealtimeWsBackendArgs, +) +from guidellm.schemas import GenerationRequest, RequestInfo, RequestTimings + + +async def _bounded_ws_recv(ws: object, *, timeout: float = 5.0) -> None: + """Recv once with a cap so stub handlers never block ``serve()`` teardown.""" + with contextlib.suppress(asyncio.TimeoutError, ConnectionClosed): + await asyncio.wait_for(ws.recv(), timeout=timeout) + + +@pytest.mark.asyncio +async def test_resolve_streams_deltas_and_done(monkeypatch: pytest.MonkeyPatch) -> None: + """Fake server speaks vLLM-style realtime events; PCM path is patched.""" + + async def handler(ws: object) -> None: + await ws.send( + json.dumps({"type": "session.created", "id": "sess-x", "created": 0}) + ) + commits: list[bool | None] = [] + while True: + msg = await ws.recv() + data = json.loads(msg if isinstance(msg, str) else msg.decode()) + if data.get("type") == "input_audio_buffer.commit": + commits.append(data.get("final")) + if data.get("final"): + break + assert commits == [False, True] + await ws.send(json.dumps({"type": "transcription.delta", "delta": "hi"})) + await ws.send( + json.dumps( + { + "type": "transcription.done", + "text": "hi", + "usage": { + "prompt_tokens": 5, + "completion_tokens": 1, + "total_tokens": 6, + }, + } + ) + ) + + monkeypatch.setattr( + "guidellm.backends.openai.realtime_ws.pcm16_append_b64_chunks", + lambda *a, **k: ["YWFhYQ=="], + ) + + async with serve(handler, "127.0.0.1", 0) as server: + port = server.sockets[0].getsockname()[1] + be = OpenAIRealtimeWebSocketBackend( + target=f"http://127.0.0.1:{port}", 
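+            # validate_backend=False: the stub serves WebSocket only, no HTTP /health.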
+ model="test-model", + validate_backend=False, + ) + await be.process_startup() + req = GenerationRequest( + request_id="r1", + columns={ + "audio_column": [ + {"audio": b"fake", "format": "mp3", "file_name": "f.mp3"} + ] + }, + ) + info = RequestInfo(timings=RequestTimings()) + out: list = [] + async for item in be.resolve(req, info): + out.append(item) + await be.process_shutdown() + + assert len(out) == 2 + assert out[0][0] is None + final_resp, _ = out[1] + assert final_resp.text == "hi" + assert final_resp.input_metrics.audio_tokens == 5 + assert final_resp.output_metrics.text_tokens == 1 + + +@pytest.mark.asyncio +async def test_transcription_done_without_deltas_sets_first_token_and_prefetch_yield( + monkeypatch: pytest.MonkeyPatch, +) -> None: + """Only ``transcription.done`` (no deltas): TTFT and two yields match delta path.""" + + async def handler(ws: object) -> None: + await ws.send( + json.dumps({"type": "session.created", "id": "sess-x", "created": 0}) + ) + commits: list[bool | None] = [] + while True: + msg = await ws.recv() + data = json.loads(msg if isinstance(msg, str) else msg.decode()) + if data.get("type") == "input_audio_buffer.commit": + commits.append(data.get("final")) + if data.get("final"): + break + assert commits == [False, True] + await ws.send( + json.dumps( + { + "type": "transcription.done", + "text": "only-done", + "usage": { + "prompt_tokens": 2, + "completion_tokens": 7, + "total_tokens": 9, + }, + } + ) + ) + + monkeypatch.setattr( + "guidellm.backends.openai.realtime_ws.pcm16_append_b64_chunks", + lambda *a, **k: ["YWFhYQ=="], + ) + + async with serve(handler, "127.0.0.1", 0) as server: + port = server.sockets[0].getsockname()[1] + be = OpenAIRealtimeWebSocketBackend( + target=f"http://127.0.0.1:{port}", + model="test-model", + validate_backend=False, + ) + await be.process_startup() + req = GenerationRequest( + request_id="r1", + columns={ + "audio_column": [ + {"audio": b"fake", "format": "mp3", "file_name": "f.mp3"} + ] + }, + ) + info = RequestInfo(timings=RequestTimings()) + out: list = [] + async for item in be.resolve(req, info): + out.append(item) + await be.process_shutdown() + + assert len(out) == 2 + prefetch, prefetch_info = out[0] + assert prefetch is None + assert prefetch_info.timings.first_token_iteration is not None + assert prefetch_info.timings.last_token_iteration is not None + assert prefetch_info.timings.token_iterations == 1 + final_resp, final_info = out[1] + assert final_resp.text == "only-done" + assert final_resp.input_metrics.audio_tokens == 2 + assert final_resp.output_metrics.text_tokens == 7 + assert final_info.timings.request_end is not None + + +@pytest.mark.asyncio +async def test_transcription_done_usage_string_counts( + monkeypatch: pytest.MonkeyPatch, +) -> None: + """String token counts in usage should still feed AudioRequestHandler metrics.""" + + async def handler(ws: object) -> None: + await ws.send( + json.dumps({"type": "session.created", "id": "sess-x", "created": 0}) + ) + while True: + msg = await ws.recv() + data = json.loads(msg if isinstance(msg, str) else msg.decode()) + if ( + data.get("type") == "input_audio_buffer.commit" + and data.get("final") + ): + break + await ws.send(json.dumps({"type": "transcription.delta", "delta": "x"})) + await ws.send( + json.dumps( + { + "type": "transcription.done", + "text": "x", + "usage": { + "prompt_tokens": "12", + "completion_tokens": "3", + "total_tokens": "15", + }, + } + ) + ) + + monkeypatch.setattr( + 
"guidellm.backends.openai.realtime_ws.pcm16_append_b64_chunks", + lambda *a, **k: ["YWFhYQ=="], + ) + + async with serve(handler, "127.0.0.1", 0) as server: + port = server.sockets[0].getsockname()[1] + be = OpenAIRealtimeWebSocketBackend( + target=f"http://127.0.0.1:{port}", + model="test-model", + validate_backend=False, + ) + await be.process_startup() + req = GenerationRequest( + request_id="r1", + columns={ + "audio_column": [ + {"audio": b"fake", "format": "mp3", "file_name": "f.mp3"} + ] + }, + ) + info = RequestInfo(timings=RequestTimings()) + out: list = [] + async for item in be.resolve(req, info): + out.append(item) + await be.process_shutdown() + + final_resp, _ = out[1] + assert final_resp.input_metrics.audio_tokens == 12 + assert final_resp.output_metrics.text_tokens == 3 + + +@pytest.mark.asyncio +async def test_server_error_event_raises( + monkeypatch: pytest.MonkeyPatch, +) -> None: + async def handler(ws: object) -> None: + await ws.send( + json.dumps({"type": "session.created", "id": "sess-x", "created": 0}) + ) + while True: + msg = await ws.recv() + data = json.loads(msg if isinstance(msg, str) else msg.decode()) + if ( + data.get("type") == "input_audio_buffer.commit" + and not data.get("final") + ): + await ws.send( + json.dumps({"type": "error", "error": "bad", "code": "e1"}) + ) + await _bounded_ws_recv(ws) + return + + monkeypatch.setattr( + "guidellm.backends.openai.realtime_ws.pcm16_append_b64_chunks", + lambda *a, **k: ["YQ=="], + ) + + async with serve(handler, "127.0.0.1", 0) as server: + port = server.sockets[0].getsockname()[1] + be = OpenAIRealtimeWebSocketBackend( + target=f"http://127.0.0.1:{port}", + model="m", + validate_backend=False, + ) + await be.process_startup() + req = GenerationRequest( + request_id="r1", + columns={"audio_column": [{"audio": b"x"}]}, + ) + info = RequestInfo(timings=RequestTimings()) + with pytest.raises(RuntimeError, match="bad"): + async for _ in be.resolve(req, info): + pass + await be.process_shutdown() + + +@pytest.mark.asyncio +async def test_first_message_error_event_raises( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.setattr( + "guidellm.backends.openai.realtime_ws.pcm16_append_b64_chunks", + lambda *a, **k: ["YQ=="], + ) + + async def handler(ws: object) -> None: + await ws.send(json.dumps({"type": "error", "error": "auth failed"})) + + async with serve(handler, "127.0.0.1", 0) as server: + port = server.sockets[0].getsockname()[1] + be = OpenAIRealtimeWebSocketBackend( + target=f"http://127.0.0.1:{port}", + model="m", + validate_backend=False, + ) + await be.process_startup() + req = GenerationRequest( + request_id="r1", + columns={"audio_column": [{"audio": b"x"}]}, + ) + info = RequestInfo(timings=RequestTimings()) + with pytest.raises(RuntimeError, match="auth failed"): + async for _ in be.resolve(req, info): + pass + await be.process_shutdown() + + +@pytest.mark.asyncio +async def test_first_message_not_session_created_raises( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.setattr( + "guidellm.backends.openai.realtime_ws.pcm16_append_b64_chunks", + lambda *a, **k: ["YQ=="], + ) + + async def handler(ws: object) -> None: + await ws.send(json.dumps({"type": "unexpected.ping"})) + + async with serve(handler, "127.0.0.1", 0) as server: + port = server.sockets[0].getsockname()[1] + be = OpenAIRealtimeWebSocketBackend( + target=f"http://127.0.0.1:{port}", + model="m", + validate_backend=False, + ) + await be.process_startup() + req = GenerationRequest( + request_id="r1", + 
columns={"audio_column": [{"audio": b"x"}]}, + ) + info = RequestInfo(timings=RequestTimings()) + with pytest.raises(RuntimeError, match="session.created"): + async for _ in be.resolve(req, info): + pass + await be.process_shutdown() + + +@pytest.mark.asyncio +async def test_invalid_json_from_server_raises( + monkeypatch: pytest.MonkeyPatch, +) -> None: + async def handler(ws: object) -> None: + await ws.send( + json.dumps({"type": "session.created", "id": "sess-x", "created": 0}) + ) + while True: + msg = await ws.recv() + data = json.loads(msg if isinstance(msg, str) else msg.decode()) + if ( + data.get("type") == "input_audio_buffer.commit" + and data.get("final") + ): + break + await ws.send("{not-json") + + monkeypatch.setattr( + "guidellm.backends.openai.realtime_ws.pcm16_append_b64_chunks", + lambda *a, **k: ["YQ=="], + ) + + async with serve(handler, "127.0.0.1", 0) as server: + port = server.sockets[0].getsockname()[1] + be = OpenAIRealtimeWebSocketBackend( + target=f"http://127.0.0.1:{port}", + model="m", + validate_backend=False, + ) + await be.process_startup() + req = GenerationRequest( + request_id="r1", + columns={"audio_column": [{"audio": b"x"}]}, + ) + info = RequestInfo(timings=RequestTimings()) + with pytest.raises(RuntimeError, match="Invalid JSON"): + async for _ in be.resolve(req, info): + pass + await be.process_shutdown() + + +@pytest.mark.asyncio +async def test_resolve_requires_process_startup() -> None: + be = OpenAIRealtimeWebSocketBackend( + target="http://127.0.0.1:9", + model="m", + validate_backend=False, + ) + req = GenerationRequest( + request_id="r1", + columns={"audio_column": [{"audio": b"x"}]}, + ) + info = RequestInfo(timings=RequestTimings()) + with pytest.raises(RuntimeError, match="started"): + async for _ in be.resolve(req, info): + pass + + +@pytest.mark.asyncio +async def test_resolve_rejects_history() -> None: + be = OpenAIRealtimeWebSocketBackend( + target="http://127.0.0.1:9", + model="m", + validate_backend=False, + ) + await be.process_startup() + prev = GenerationRequest(request_id="prev", columns={}) + req = GenerationRequest( + request_id="r1", + columns={"audio_column": [{"audio": b"x"}]}, + ) + info = RequestInfo(timings=RequestTimings()) + with pytest.raises(NotImplementedError, match="history"): + async for _ in be.resolve(req, info, history=[(prev, None)]): + pass + await be.process_shutdown() + + +@pytest.mark.asyncio +async def test_resolve_rejects_wrong_audio_column_count() -> None: + be = OpenAIRealtimeWebSocketBackend( + target="http://127.0.0.1:9", + model="m", + validate_backend=False, + ) + await be.process_startup() + info = RequestInfo(timings=RequestTimings()) + + async def drain(req: GenerationRequest) -> None: + async for _ in be.resolve(req, info): + pass + + req_empty = GenerationRequest(request_id="r1", columns={"audio_column": []}) + with pytest.raises(ValueError, match="exactly one"): + await drain(req_empty) + req_two = GenerationRequest( + request_id="r2", + columns={"audio_column": [{"audio": b"a"}, {"audio": b"b"}]}, + ) + with pytest.raises(ValueError, match="exactly one"): + await drain(req_two) + await be.process_shutdown() + + +@pytest.mark.asyncio +@pytest.mark.timeout(45) +async def test_resolve_cancelled_after_delta_yields_partial_then_reraises( + monkeypatch: pytest.MonkeyPatch, +) -> None: + delta_seen = asyncio.Event() + + async def handler(ws: object) -> None: + await ws.send( + json.dumps({"type": "session.created", "id": "s", "created": 0}) + ) + while True: + msg = await ws.recv() + data = 
json.loads(msg if isinstance(msg, str) else msg.decode()) + if ( + data.get("type") == "input_audio_buffer.commit" + and data.get("final") + ): + break + await ws.send( + json.dumps({"type": "transcription.delta", "delta": "partial"}) + ) + delta_seen.set() + await _bounded_ws_recv(ws) + + monkeypatch.setattr( + "guidellm.backends.openai.realtime_ws.pcm16_append_b64_chunks", + lambda *a, **k: ["YQ=="], + ) + + results: list = [] + async with serve(handler, "127.0.0.1", 0) as server: + port = server.sockets[0].getsockname()[1] + be = OpenAIRealtimeWebSocketBackend( + target=f"http://127.0.0.1:{port}", + model="m", + validate_backend=False, + ) + await be.process_startup() + req = GenerationRequest( + request_id="r1", + columns={"audio_column": [{"audio": b"x"}]}, + ) + info = RequestInfo(timings=RequestTimings()) + + async def collect() -> None: + async for item in be.resolve(req, info): + results.append(item) + + task = asyncio.create_task(collect()) + await asyncio.wait_for(delta_seen.wait(), timeout=5.0) + await asyncio.sleep(0.05) + task.cancel() + with pytest.raises(asyncio.CancelledError): + await task + await be.process_shutdown() + + assert len(results) == 2 + assert results[0][0] is None + assert results[1][0] is not None + assert results[1][0].text == "partial" + + +@pytest.mark.asyncio +async def test_non_object_json_after_handshake_raises( + monkeypatch: pytest.MonkeyPatch, +) -> None: + async def handler(ws: object) -> None: + await ws.send( + json.dumps({"type": "session.created", "id": "s", "created": 0}) + ) + while True: + msg = await ws.recv() + data = json.loads(msg if isinstance(msg, str) else msg.decode()) + if ( + data.get("type") == "input_audio_buffer.commit" + and data.get("final") + ): + break + await ws.send("[]") + + monkeypatch.setattr( + "guidellm.backends.openai.realtime_ws.pcm16_append_b64_chunks", + lambda *a, **k: ["YQ=="], + ) + + async with serve(handler, "127.0.0.1", 0) as server: + port = server.sockets[0].getsockname()[1] + be = OpenAIRealtimeWebSocketBackend( + target=f"http://127.0.0.1:{port}", + model="m", + validate_backend=False, + ) + await be.process_startup() + req = GenerationRequest( + request_id="r1", + columns={"audio_column": [{"audio": b"x"}]}, + ) + info = RequestInfo(timings=RequestTimings()) + with pytest.raises(RuntimeError, match="JSON object"): + async for _ in be.resolve(req, info): + pass + await be.process_shutdown() + + +@pytest.mark.asyncio +async def test_excessive_ignored_events_raises( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.setattr( + "guidellm.backends.openai.realtime_ws._MAX_IGNORED_WS_EVENT_TYPES", + 2, + ) + + async def handler(ws: object) -> None: + await ws.send( + json.dumps({"type": "session.created", "id": "s", "created": 0}) + ) + while True: + msg = await ws.recv() + data = json.loads(msg if isinstance(msg, str) else msg.decode()) + if ( + data.get("type") == "input_audio_buffer.commit" + and data.get("final") + ): + break + for _ in range(10): + await ws.send(json.dumps({"type": "noise.event"})) + + monkeypatch.setattr( + "guidellm.backends.openai.realtime_ws.pcm16_append_b64_chunks", + lambda *a, **k: ["YQ=="], + ) + + async with serve(handler, "127.0.0.1", 0) as server: + port = server.sockets[0].getsockname()[1] + be = OpenAIRealtimeWebSocketBackend( + target=f"http://127.0.0.1:{port}", + model="m", + validate_backend=False, + ) + await be.process_startup() + req = GenerationRequest( + request_id="r1", + columns={"audio_column": [{"audio": b"x"}]}, + ) + info = 
RequestInfo(timings=RequestTimings()) + with pytest.raises(RuntimeError, match="Exceeded maximum"): + async for _ in be.resolve(req, info): + pass + await be.process_shutdown() + + +@pytest.mark.asyncio +async def test_available_models_parses_response(httpx_mock: object) -> None: + httpx_mock.add_response( + url="http://127.0.0.1:9/v1/models", + json={"data": [{"id": "a"}, {"id": "b"}]}, + ) + be = OpenAIRealtimeWebSocketBackend( + target="http://127.0.0.1:9", + validate_backend=False, + ) + await be.process_startup() + assert await be.available_models() == ["a", "b"] + await be.process_shutdown() + + +@pytest.mark.asyncio +async def test_available_models_bad_data_shape_raises(httpx_mock: object) -> None: + httpx_mock.add_response( + url="http://127.0.0.1:9/v1/models", + json={"data": "not-a-list"}, + ) + be = OpenAIRealtimeWebSocketBackend( + target="http://127.0.0.1:9", + validate_backend=False, + ) + await be.process_startup() + with pytest.raises(RuntimeError, match="list"): + await be.available_models() + await be.process_shutdown() + + +@pytest.mark.asyncio +async def test_resolve_raises_when_no_model_and_empty_catalog( + httpx_mock: object, + monkeypatch: pytest.MonkeyPatch, +) -> None: + httpx_mock.add_response( + url="http://127.0.0.1:9/v1/models", + json={"data": []}, + ) + monkeypatch.setattr( + "guidellm.backends.openai.realtime_ws.pcm16_append_b64_chunks", + lambda *a, **k: ["YQ=="], + ) + be = OpenAIRealtimeWebSocketBackend( + target="http://127.0.0.1:9", + model="", + validate_backend=False, + ) + await be.process_startup() + req = GenerationRequest( + request_id="r1", + columns={"audio_column": [{"audio": b"x"}]}, + ) + info = RequestInfo(timings=RequestTimings()) + with pytest.raises(RuntimeError, match="No model configured"): + async for _ in be.resolve(req, info): + pass + await be.process_shutdown() + + +@pytest.mark.asyncio +async def test_resolve_invalid_ws_target_url_raises( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.setattr( + "guidellm.backends.openai.realtime_ws.pcm16_append_b64_chunks", + lambda *a, **k: ["YQ=="], + ) + be = OpenAIRealtimeWebSocketBackend( + target="", + model="m", + validate_backend=False, + ) + await be.process_startup() + req = GenerationRequest( + request_id="r1", + columns={"audio_column": [{"audio": b"x"}]}, + ) + info = RequestInfo(timings=RequestTimings()) + with pytest.raises(ValueError, match="Invalid target"): + async for _ in be.resolve(req, info): + pass + await be.process_shutdown() + + +@pytest.mark.asyncio +async def test_error_event_dict_formatted_message( + monkeypatch: pytest.MonkeyPatch, +) -> None: + async def handler(ws: object) -> None: + await ws.send( + json.dumps( + { + "type": "error", + "error": {"message": "auth failed", "code": "401"}, + } + ) + ) + + monkeypatch.setattr( + "guidellm.backends.openai.realtime_ws.pcm16_append_b64_chunks", + lambda *a, **k: ["YQ=="], + ) + + async with serve(handler, "127.0.0.1", 0) as server: + port = server.sockets[0].getsockname()[1] + be = OpenAIRealtimeWebSocketBackend( + target=f"http://127.0.0.1:{port}", + model="m", + validate_backend=False, + ) + await be.process_startup() + req = GenerationRequest( + request_id="r1", + columns={"audio_column": [{"audio": b"x"}]}, + ) + info = RequestInfo(timings=RequestTimings()) + with pytest.raises(RuntimeError, match="401"): + async for _ in be.resolve(req, info): + pass + await be.process_shutdown() + + +def test_openai_realtime_backend_args_model() -> None: + a = 
OpenAIRealtimeWsBackendArgs(target="http://localhost:8000", model="x") + assert a.websocket_path == "/v1/realtime" + assert a.chunk_samples == 3200 + assert a.timeout == _DEFAULT_WS_RECV_TIMEOUT diff --git a/tests/unit/backends/test_backend.py b/tests/unit/backends/test_backend.py index 4dbb76ac2..13c707496 100644 --- a/tests/unit/backends/test_backend.py +++ b/tests/unit/backends/test_backend.py @@ -319,7 +319,10 @@ async def default_model(self) -> str: @pytest.mark.smoke def test_openai_realtime_ws_backend_registered(self): - """Realtime WebSocket backend is registered and constructible.""" + """Realtime WebSocket backend is registered and constructible. + + ## WRITTEN BY AI ## + """ from guidellm.backends.openai import ( OpenAIRealtimeWebSocketBackend, OpenAIRealtimeWsBackendArgs, diff --git a/tests/unit/benchmark/schemas/generative/test_entrypoints.py b/tests/unit/benchmark/schemas/generative/test_entrypoints.py index f3758ff1e..b251bb096 100644 --- a/tests/unit/benchmark/schemas/generative/test_entrypoints.py +++ b/tests/unit/benchmark/schemas/generative/test_entrypoints.py @@ -54,7 +54,10 @@ def test_dict_backend_kwargs_transformed(self): assert args.backend_kwargs.model == "test_model" def test_openai_realtime_ws_backend_kwargs_validates(self) -> None: - """Realtime WS backend is selected explicitly; no request_format shim.""" + """Realtime WS backend is selected explicitly; no request_format shim. + + ## WRITTEN BY AI ## + """ args = BenchmarkGenerativeTextArgs.model_validate( { "backend": "openai_realtime_ws", diff --git a/tests/unit/extras/test_audio.py b/tests/unit/extras/test_audio.py index c890a21cb..de5ca6250 100644 --- a/tests/unit/extras/test_audio.py +++ b/tests/unit/extras/test_audio.py @@ -198,6 +198,7 @@ def test_end_to_end_audio_processing(sample_audio_tensor): @patch("guidellm.extras.audio._decode_audio") def test_pcm16_append_b64_chunks_rejects_unknown_dict_keys(mock_decode): + """## WRITTEN BY AI ##""" mock_decode.side_effect = AssertionError("_decode_audio should not run") with pytest.raises(ValueError, match="audio_column dict"): pcm16_append_b64_chunks({"foo": 1}) @@ -205,6 +206,7 @@ def test_pcm16_append_b64_chunks_rejects_unknown_dict_keys(mock_decode): @patch("guidellm.extras.audio._decode_audio") def test_pcm16_append_b64_chunks_splits_into_multiple_base64_chunks(mock_decode): + """## WRITTEN BY AI ##""" mock_decode.return_value = MagicMock() mock_decode.return_value.data = torch.zeros(1, 5000) mock_decode.return_value.sample_rate = 16000 @@ -217,6 +219,7 @@ def test_pcm16_append_b64_chunks_splits_into_multiple_base64_chunks(mock_decode) @patch("guidellm.extras.audio._decode_audio") def test_pcm16_append_b64_chunks_empty_wave_raises(mock_decode): + """## WRITTEN BY AI ##""" mock_decode.return_value = MagicMock() mock_decode.return_value.data = torch.zeros(1, 0) mock_decode.return_value.sample_rate = 16000 @@ -227,6 +230,7 @@ def test_pcm16_append_b64_chunks_empty_wave_raises(mock_decode): @patch("guidellm.extras.audio._decode_audio") def test_pcm16_append_b64_chunks_downmixes_stereo(mock_decode): + """## WRITTEN BY AI ##""" mock_decode.return_value = MagicMock() mock_decode.return_value.data = torch.randn(2, 200) mock_decode.return_value.sample_rate = 16000 @@ -238,6 +242,7 @@ def test_pcm16_append_b64_chunks_downmixes_stereo(mock_decode): @patch("guidellm.extras.audio._decode_audio") def test_pcm16_append_b64_chunks_audio_dict_passes_outer_sample_rate(mock_decode): + """## WRITTEN BY AI ##""" mock_decode.return_value = MagicMock() 
mock_decode.return_value.data = torch.zeros(1, 100) mock_decode.return_value.sample_rate = 16000 @@ -250,6 +255,7 @@ def test_pcm16_append_b64_chunks_audio_dict_passes_outer_sample_rate(mock_decode @patch("guidellm.extras.audio._decode_audio") def test_pcm16_append_b64_chunks_sampling_rate_alias(mock_decode): + """## WRITTEN BY AI ##""" mock_decode.return_value = MagicMock() mock_decode.return_value.data = torch.zeros(1, 50) mock_decode.return_value.sample_rate = 16000 @@ -261,6 +267,7 @@ def test_pcm16_append_b64_chunks_sampling_rate_alias(mock_decode): @patch("guidellm.extras.audio._decode_audio") def test_pcm16_append_b64_chunks_invalid_decoder_sample_rate_raises(mock_decode): + """## WRITTEN BY AI ##""" mock_decode.return_value = MagicMock() mock_decode.return_value.data = torch.zeros(1, 10) mock_decode.return_value.sample_rate = 0 From 949373984d8bd245b2d6c243fe643aea6c32fbfc Mon Sep 17 00:00:00 2001 From: Uri Shaket Date: Mon, 4 May 2026 10:37:19 +0300 Subject: [PATCH 03/12] lint Signed-off-by: Uri Shaket --- src/guidellm/backends/openai/http.py | 1 + src/guidellm/backends/openai/realtime_ws.py | 9 ++-- src/guidellm/extras/audio.py | 10 ++-- tests/e2e/test_realtime_ws_e2e.py | 4 +- .../unit/backends/openai/test_realtime_ws.py | 46 +++++-------------- 5 files changed, 23 insertions(+), 47 deletions(-) diff --git a/src/guidellm/backends/openai/http.py b/src/guidellm/backends/openai/http.py index ef83b4265..e3e45c845 100644 --- a/src/guidellm/backends/openai/http.py +++ b/src/guidellm/backends/openai/http.py @@ -128,6 +128,7 @@ def validate_request_format(cls, v: str | None) -> str | None: "audio_translations": "/v1/audio/translations", } + @Backend.register("openai_http") class OpenAIHTTPBackend(Backend): """ diff --git a/src/guidellm/backends/openai/realtime_ws.py b/src/guidellm/backends/openai/realtime_ws.py index a84f0d5a9..417296193 100644 --- a/src/guidellm/backends/openai/realtime_ws.py +++ b/src/guidellm/backends/openai/realtime_ws.py @@ -143,8 +143,7 @@ def _ensure_pcm16_append_b64_chunks() -> Any: except ImportError as exc: raise ImportError( "The openai_realtime_ws backend requires the audio extras for PCM " - "handling used in realtime transcription. " - + _AUDIO_EXTRA_HINT + "handling used in realtime transcription. " + _AUDIO_EXTRA_HINT ) from exc _pcm_imported_fn["fn"] = fn return fn @@ -395,7 +394,7 @@ async def default_model(self) -> str: self.model = models[0] if models else "" return self.model - async def resolve( # noqa: C901, PLR0912, PLR0915 # type: ignore[override, misc] + async def resolve( # type: ignore[override, misc] # noqa: C901, PLR0912, PLR0915 self, request: GenerationRequest, request_info: RequestInfo, @@ -501,9 +500,7 @@ async def resolve( # noqa: C901, PLR0912, PLR0915 # type: ignore[override, mis request_info.timings.token_iterations = 0 yield None, request_info request_info.timings.last_token_iteration = iter_time - request_info.timings.token_iterations += ( - 1 if delta else 0 - ) + request_info.timings.token_iterations += 1 if delta else 0 elif et == "transcription.done": iter_time = time.time() diff --git a/src/guidellm/extras/audio.py b/src/guidellm/extras/audio.py index e8bcd6cde..acdfb3b0f 100644 --- a/src/guidellm/extras/audio.py +++ b/src/guidellm/extras/audio.py @@ -301,9 +301,13 @@ def pcm16_append_b64_chunks( # Convert float waveform to signed little-endian PCM16 bytes. 
wave = data.squeeze(0) pcm_i16 = ( - wave.clamp(_PCM16_WAVE_CLIP_MIN, _PCM16_WAVE_CLIP_MAX) - * _PCM16_FLOAT_TO_INT16_SCALE - ).round().to(torch.int16) + ( + wave.clamp(_PCM16_WAVE_CLIP_MIN, _PCM16_WAVE_CLIP_MAX) + * _PCM16_FLOAT_TO_INT16_SCALE + ) + .round() + .to(torch.int16) + ) buf = pcm_i16.cpu().numpy().tobytes() # Split PCM bytes into chunk-sized base64 payloads for append events. diff --git a/tests/e2e/test_realtime_ws_e2e.py b/tests/e2e/test_realtime_ws_e2e.py index 6f3e55e70..160f76ca0 100644 --- a/tests/e2e/test_realtime_ws_e2e.py +++ b/tests/e2e/test_realtime_ws_e2e.py @@ -56,9 +56,7 @@ async def handler(ws: Any) -> None: if payload.get("final"): break assert commits == [False, True] - await ws.send( - json.dumps({"type": "transcription.delta", "delta": delta_text}) - ) + await ws.send(json.dumps({"type": "transcription.delta", "delta": delta_text})) await ws.send( json.dumps( { diff --git a/tests/unit/backends/openai/test_realtime_ws.py b/tests/unit/backends/openai/test_realtime_ws.py index 115ceb3c4..b00821d53 100644 --- a/tests/unit/backends/openai/test_realtime_ws.py +++ b/tests/unit/backends/openai/test_realtime_ws.py @@ -187,10 +187,7 @@ async def handler(ws: object) -> None: while True: msg = await ws.recv() data = json.loads(msg if isinstance(msg, str) else msg.decode()) - if ( - data.get("type") == "input_audio_buffer.commit" - and data.get("final") - ): + if data.get("type") == "input_audio_buffer.commit" and data.get("final"): break await ws.send(json.dumps({"type": "transcription.delta", "delta": "x"})) await ws.send( @@ -250,9 +247,8 @@ async def handler(ws: object) -> None: while True: msg = await ws.recv() data = json.loads(msg if isinstance(msg, str) else msg.decode()) - if ( - data.get("type") == "input_audio_buffer.commit" - and not data.get("final") + if data.get("type") == "input_audio_buffer.commit" and not data.get( + "final" ): await ws.send( json.dumps({"type": "error", "error": "bad", "code": "e1"}) @@ -357,10 +353,7 @@ async def handler(ws: object) -> None: while True: msg = await ws.recv() data = json.loads(msg if isinstance(msg, str) else msg.decode()) - if ( - data.get("type") == "input_audio_buffer.commit" - and data.get("final") - ): + if data.get("type") == "input_audio_buffer.commit" and data.get("final"): break await ws.send("{not-json") @@ -459,20 +452,13 @@ async def test_resolve_cancelled_after_delta_yields_partial_then_reraises( delta_seen = asyncio.Event() async def handler(ws: object) -> None: - await ws.send( - json.dumps({"type": "session.created", "id": "s", "created": 0}) - ) + await ws.send(json.dumps({"type": "session.created", "id": "s", "created": 0})) while True: msg = await ws.recv() data = json.loads(msg if isinstance(msg, str) else msg.decode()) - if ( - data.get("type") == "input_audio_buffer.commit" - and data.get("final") - ): + if data.get("type") == "input_audio_buffer.commit" and data.get("final"): break - await ws.send( - json.dumps({"type": "transcription.delta", "delta": "partial"}) - ) + await ws.send(json.dumps({"type": "transcription.delta", "delta": "partial"})) delta_seen.set() await _bounded_ws_recv(ws) @@ -519,16 +505,11 @@ async def test_non_object_json_after_handshake_raises( monkeypatch: pytest.MonkeyPatch, ) -> None: async def handler(ws: object) -> None: - await ws.send( - json.dumps({"type": "session.created", "id": "s", "created": 0}) - ) + await ws.send(json.dumps({"type": "session.created", "id": "s", "created": 0})) while True: msg = await ws.recv() data = json.loads(msg if isinstance(msg, str) else 
msg.decode()) - if ( - data.get("type") == "input_audio_buffer.commit" - and data.get("final") - ): + if data.get("type") == "input_audio_buffer.commit" and data.get("final"): break await ws.send("[]") @@ -566,16 +547,11 @@ async def test_excessive_ignored_events_raises( ) async def handler(ws: object) -> None: - await ws.send( - json.dumps({"type": "session.created", "id": "s", "created": 0}) - ) + await ws.send(json.dumps({"type": "session.created", "id": "s", "created": 0})) while True: msg = await ws.recv() data = json.loads(msg if isinstance(msg, str) else msg.decode()) - if ( - data.get("type") == "input_audio_buffer.commit" - and data.get("final") - ): + if data.get("type") == "input_audio_buffer.commit" and data.get("final"): break for _ in range(10): await ws.send(json.dumps({"type": "noise.event"})) From 8fee38a887d845716381a11095d75e97e14116be Mon Sep 17 00:00:00 2001 From: Uri Shaket Date: Mon, 4 May 2026 17:25:03 +0300 Subject: [PATCH 04/12] Update src/guidellm/backends/openai/openai_common.py Co-authored-by: Samuel Monson Signed-off-by: Uri Shaket --- src/guidellm/backends/openai/openai_common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/guidellm/backends/openai/openai_common.py b/src/guidellm/backends/openai/openai_common.py index c35d3d6bd..bc86c9245 100644 --- a/src/guidellm/backends/openai/openai_common.py +++ b/src/guidellm/backends/openai/openai_common.py @@ -4,7 +4,7 @@ from typing import Any -# NOTE: Matches httpx's default connect timeout; shared by HTTP and WebSocket backends. +# NOTE: This value is taken from httpx's default FALLBACK_TIMEOUT = 5.0 From f60beea9045e51c8922ba7362300412288137190 Mon Sep 17 00:00:00 2001 From: Uri Shaket Date: Mon, 4 May 2026 17:25:14 +0300 Subject: [PATCH 05/12] Update src/guidellm/backends/openai/openai_common.py Co-authored-by: Samuel Monson Signed-off-by: Uri Shaket --- src/guidellm/backends/openai/openai_common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/guidellm/backends/openai/openai_common.py b/src/guidellm/backends/openai/openai_common.py index bc86c9245..269fd24c8 100644 --- a/src/guidellm/backends/openai/openai_common.py +++ b/src/guidellm/backends/openai/openai_common.py @@ -8,7 +8,7 @@ FALLBACK_TIMEOUT = 5.0 -def build_openai_headers( +def build_headers( api_key: str | None, existing_headers: dict[str, str] | None = None, ) -> dict[str, str] | None: From 5892e145abb83a0f394eed5fbb968e82a9f32fde Mon Sep 17 00:00:00 2001 From: Uri Shaket Date: Mon, 4 May 2026 17:35:38 +0300 Subject: [PATCH 06/12] CR Signed-off-by: Uri Shaket --- pyproject.toml | 3 +- src/guidellm/backends/__init__.py | 8 +- src/guidellm/backends/openai/__init__.py | 6 +- src/guidellm/backends/openai/http.py | 16 +- src/guidellm/backends/openai/openai_common.py | 68 --- src/guidellm/backends/openai/realtime_ws.py | 573 ------------------ tests/e2e/test_realtime_ws_e2e.py | 4 +- .../unit/backends/openai/test_realtime_ws.py | 74 +-- tests/unit/backends/test_backend.py | 8 +- .../schemas/generative/test_entrypoints.py | 4 +- 10 files changed, 58 insertions(+), 706 deletions(-) delete mode 100644 src/guidellm/backends/openai/openai_common.py delete mode 100644 src/guidellm/backends/openai/realtime_ws.py diff --git a/pyproject.toml b/pyproject.toml index a148dd54c..420b4cf0b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -86,8 +86,7 @@ audio = [ # Torchcodec needs specific torch version "torch==2.10.*", "torchcodec==0.10.*", - # openai_realtime_ws backend (vLLM /v1/realtime) - "websockets>=13.0,<16.0", 
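+    # WebSocket client for the realtime transcription backend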
+    "websockets>=13.0",
 ]
 vision = [
     "datasets[vision]",
diff --git a/src/guidellm/backends/__init__.py b/src/guidellm/backends/__init__.py
index d423daf54..812e70f5b 100644
--- a/src/guidellm/backends/__init__.py
+++ b/src/guidellm/backends/__init__.py
@@ -16,8 +16,8 @@
     AudioRequestHandler,
     ChatCompletionsRequestHandler,
     OpenAIHTTPBackend,
-    OpenAIRealtimeWebSocketBackend,
-    OpenAIRealtimeWsBackendArgs,
     OpenAIRequestHandler,
     OpenAIRequestHandlerFactory,
+    OpenAIWebSocketBackend,
+    OpenAIWebSocketBackendArgs,
     TextCompletionsRequestHandler,
@@ -37,8 +37,8 @@
     "BackendType",
     "ChatCompletionsRequestHandler",
     "OpenAIHTTPBackend",
-    "OpenAIRealtimeWebSocketBackend",
-    "OpenAIRealtimeWsBackendArgs",
     "OpenAIRequestHandler",
     "OpenAIRequestHandlerFactory",
+    "OpenAIWebSocketBackend",
+    "OpenAIWebSocketBackendArgs",
     "TextCompletionsRequestHandler",
diff --git a/src/guidellm/backends/openai/__init__.py b/src/guidellm/backends/openai/__init__.py
index da4f2bbb7..fe781e2bc 100644
--- a/src/guidellm/backends/openai/__init__.py
+++ b/src/guidellm/backends/openai/__init__.py
@@ -1,5 +1,5 @@
 from .http import OpenAIHTTPBackend
-from .realtime_ws import OpenAIRealtimeWebSocketBackend, OpenAIRealtimeWsBackendArgs
+from .websocket import OpenAIWebSocketBackend, OpenAIWebSocketBackendArgs
 from .request_handlers import (
     AudioRequestHandler,
     ChatCompletionsRequestHandler,
@@ -13,8 +13,8 @@
     "AudioRequestHandler",
     "ChatCompletionsRequestHandler",
     "OpenAIHTTPBackend",
-    "OpenAIRealtimeWebSocketBackend",
-    "OpenAIRealtimeWsBackendArgs",
     "OpenAIRequestHandler",
     "OpenAIRequestHandlerFactory",
+    "OpenAIWebSocketBackend",
+    "OpenAIWebSocketBackendArgs",
     "ResponsesRequestHandler",
diff --git a/src/guidellm/backends/openai/http.py b/src/guidellm/backends/openai/http.py
index e3e45c845..a96d0f19b 100644
--- a/src/guidellm/backends/openai/http.py
+++ b/src/guidellm/backends/openai/http.py
@@ -19,10 +19,10 @@
 from pydantic import Field, field_validator
 
 from guidellm.backends.backend import Backend, BackendArgs
-from guidellm.backends.openai.openai_common import (
+from guidellm.backends.openai.common import (
     FALLBACK_TIMEOUT,
-    build_openai_headers,
-    resolve_openai_validate_kwargs,
+    build_headers,
+    resolve_validate_kwargs,
 )
 from guidellm.backends.openai.request_handlers import OpenAIRequestHandlerFactory
 from guidellm.schemas import (
@@ -232,7 +232,7 @@
         self.http2 = http2
         self.follow_redirects = follow_redirects
         self.verify = verify
-        self.validate_backend: dict[str, Any] | None = self._resolve_validate_kwargs(
+        self.validate_backend: dict[str, Any] | None = resolve_validate_kwargs(
             validate_backend
         )
         self.stream: bool = stream
@@ -503,11 +503,5 @@
     def _build_headers(
         self, existing_headers: dict[str, str] | None = None
     ) -> dict[str, str] | None:
-        return build_openai_headers(self.api_key, existing_headers)
+        return build_headers(self.api_key, existing_headers)
 
-    def _resolve_validate_kwargs(
-        self, validate_backend: bool | str | dict[str, Any]
-    ) -> dict[str, Any] | None:
-        return resolve_openai_validate_kwargs(
-            validate_backend, self.target, self.api_routes
-        )
diff --git a/src/guidellm/backends/openai/openai_common.py b/src/guidellm/backends/openai/openai_common.py
deleted file mode 100644
index 269fd24c8..000000000
--- a/src/guidellm/backends/openai/openai_common.py
+++ /dev/null
@@ -1,68 +0,0 @@
-"""Shared helpers for OpenAI-compatible HTTP and WebSocket backends."""
-
-from __future__ import annotations
-
-from typing 
import Any - -# NOTE: This value is taken from httpx's default -FALLBACK_TIMEOUT = 5.0 - - -def build_headers( - api_key: str | None, - existing_headers: dict[str, str] | None = None, -) -> dict[str, str] | None: - """ - Build headers with bearer authentication for OpenAI-compatible requests. - - Merges the Authorization bearer token (if ``api_key`` is set) with any - existing headers. User-provided headers take precedence over the bearer token. - - :param api_key: Optional API key for Bearer authentication - :param existing_headers: Optional headers to merge in - :return: Headers dict, or ``None`` if there are no headers to send - """ - headers: dict[str, str] = {} - if api_key: - headers["Authorization"] = f"Bearer {api_key}" - if existing_headers: - headers = {**headers, **existing_headers} - return headers or None - - -def resolve_openai_validate_kwargs( - validate_backend: bool | str | dict[str, Any], - target: str, - api_routes: dict[str, str], -) -> dict[str, Any] | None: - """ - Normalize ``validate_backend`` into kwargs for ``httpx`` request(). - - Accepts the same shapes as - :class:`~guidellm.backends.openai.http.OpenAIHTTPBackend`. - """ - raw = validate_backend - if not raw: - return None - - if raw is True: - raw = "/health" - - if isinstance(raw, str): - url = f"{target}/{api_routes[raw]}" if raw in api_routes else raw - request_kwargs: dict[str, Any] = {"method": "GET", "url": url} - elif isinstance(raw, dict): - request_kwargs = raw - else: - request_kwargs = raw - - if not isinstance(request_kwargs, dict) or "url" not in request_kwargs: - raise ValueError( - "validate_backend must be a boolean, string, or dictionary and contain " - f"a target URL. Got: {request_kwargs}" - ) - - if "method" not in request_kwargs: - request_kwargs["method"] = "GET" - - return request_kwargs diff --git a/src/guidellm/backends/openai/realtime_ws.py b/src/guidellm/backends/openai/realtime_ws.py deleted file mode 100644 index 417296193..000000000 --- a/src/guidellm/backends/openai/realtime_ws.py +++ /dev/null @@ -1,573 +0,0 @@ -""" -WebSocket backend for vLLM-compatible realtime audio transcription. - -Implements the JSON event protocol used by vLLM's ``/v1/realtime`` endpoint: -``session.created`` → ``session.update`` → ``input_audio_buffer.append`` → -``input_audio_buffer.commit`` (``final: false`` starts transcription, then -``final: true`` ends the audio stream) → ``transcription.delta`` / -``transcription.done``. -""" - -from __future__ import annotations - -import asyncio -import json -import ssl -import time -from collections.abc import AsyncIterator -from typing import TYPE_CHECKING, Any -from urllib.parse import ParseResult, urlparse - -import httpx -from pydantic import Field - -if TYPE_CHECKING: - from websockets.asyncio.client import ClientConnection - -from guidellm.backends.backend import Backend, BackendArgs -from guidellm.backends.openai.openai_common import ( - FALLBACK_TIMEOUT, - build_openai_headers, - resolve_openai_validate_kwargs, -) -from guidellm.backends.openai.request_handlers import AudioRequestHandler -from guidellm.schemas import ( - GenerationRequest, - GenerationRequestArguments, - GenerationResponse, - RequestInfo, -) - -__all__ = [ - "OpenAIRealtimeWebSocketBackend", - "OpenAIRealtimeWsBackendArgs", -] - -_WS_API_ROUTES = { - "/health": "health", - "/v1/models": "v1/models", -} - -# Guard against a misbehaving server that only emits ignored event types. 
-_MAX_IGNORED_WS_EVENT_TYPES = 50_000 - -# Per-message WebSocket recv timeout default so benchmark workers do not hang forever -# on a silent peer. Pass ``timeout=None`` to wait indefinitely. -_DEFAULT_WS_RECV_TIMEOUT = 120.0 - -_AUDIO_EXTRA_HINT = ( - "Install optional audio extras: pip install 'guidellm[audio]' " - "(includes websockets and torchcodec for realtime transcription)." -) - - -def _require_ws_connect() -> Any: - try: - from websockets.asyncio.client import connect as ws_connect - except ImportError as exc: - raise ImportError( - "The openai_realtime_ws backend requires the 'websockets' package. " - + _AUDIO_EXTRA_HINT - ) from exc - return ws_connect - - -def _ws_error_message(err: Any) -> str: - """Format WebSocket ``error`` for exceptions (supports dict payloads).""" - if isinstance(err, dict): - msg = err.get("message") or err.get("msg") - code = err.get("code") - parts = [str(p) for p in (code, msg) if p] - if parts: - return ": ".join(parts) - try: - return json.dumps(err)[:500] - except (TypeError, ValueError): - return repr(err) - if err is None or err == "": - return "WebSocket error" - return str(err) - - -def _model_ids_from_openai_models_payload(payload: Any) -> list[str]: - """Parse ``GET /v1/models`` JSON body; raise RuntimeError if shape is unexpected.""" - if not isinstance(payload, dict): - raise RuntimeError( - "Unexpected /v1/models response: top-level JSON must be an object, " - f"got {type(payload).__name__}" - ) - data = payload.get("data") - if not isinstance(data, list): - raise RuntimeError( - "Unexpected /v1/models response: 'data' must be a list, " - f"got {type(data).__name__}" - ) - ids: list[str] = [] - for i, item in enumerate(data): - if not isinstance(item, dict) or "id" not in item: - raise RuntimeError( - "Unexpected /v1/models response: each entry must be an object with " - f"'id' (index {i})" - ) - ids.append(str(item["id"])) - return ids - - -def _load_ws_event(raw: str) -> dict[str, Any]: - """Parse a JSON WebSocket text frame; raise RuntimeError on invalid JSON.""" - try: - parsed: Any = json.loads(raw) - except json.JSONDecodeError as exc: - raise RuntimeError( - f"Invalid JSON from realtime WebSocket: {exc.msg} at position {exc.pos}" - ) from exc - if not isinstance(parsed, dict): - raise RuntimeError( - f"Expected JSON object from realtime WebSocket, got {type(parsed).__name__}" - ) - return parsed - - -# Lazy import cache (no ``global``); tests may set ``pcm16_append_b64_chunks`` directly. -pcm16_append_b64_chunks: Any = None -_pcm_imported_fn: dict[str, Any] = {"fn": None} - - -def _ensure_pcm16_append_b64_chunks() -> Any: - if pcm16_append_b64_chunks is not None: - return pcm16_append_b64_chunks - if _pcm_imported_fn["fn"] is not None: - return _pcm_imported_fn["fn"] - try: - from guidellm.extras.audio import pcm16_append_b64_chunks as fn - except ImportError as exc: - raise ImportError( - "The openai_realtime_ws backend requires the audio extras for PCM " - "handling used in realtime transcription. 
" + _AUDIO_EXTRA_HINT - ) from exc - _pcm_imported_fn["fn"] = fn - return fn - - -def _coerce_usage_int(value: Any) -> int | None: - if isinstance(value, bool): - return None - if isinstance(value, int | float): - return int(value) - if isinstance(value, str): - stripped = value.strip() - if not stripped: - return None - try: - return int(stripped) - except ValueError: - return None - return None - - -def _normalize_transcription_usage( - raw_usage: Any, -) -> dict[str, int | dict[str, int]] | None: - """Coerce OpenAI-style usage dict values to ints (including numeric strings).""" - if not isinstance(raw_usage, dict): - return None - result: dict[str, int | dict[str, int]] = {} - for key, val in raw_usage.items(): - if isinstance(val, dict): - inner: dict[str, int] = {} - for ik, iv in val.items(): - num = _coerce_usage_int(iv) - if num is not None: - inner[ik] = num - if inner: - result[key] = inner - else: - num = _coerce_usage_int(val) - if num is not None: - result[key] = num - return result if result else None - - -class OpenAIRealtimeWsBackendArgs(BackendArgs): - """Arguments for creating the realtime WebSocket backend.""" - - target: str = Field( - description=( - "HTTP(S) base URL of the server (WebSocket URL is derived from it)." - ), - json_schema_extra={ - "error_message": ( - "Backend '{backend_type}' requires --target with a valid URL." - ) - }, - ) - model: str | None = Field( - default=None, - description="Model identifier (required unless discoverable from /v1/models).", - ) - websocket_path: str = Field( - default="/v1/realtime", - description="WebSocket path on the server (default /v1/realtime).", - ) - chunk_samples: int = Field( - default=3200, - ge=1, - description="PCM16 frames per input_audio_buffer.append chunk (16 kHz).", - ) - api_key: str | None = Field(default=None, description="Bearer token if required.") - verify: bool = Field(default=False, description="Verify TLS certificates.") - timeout: float | None = Field( - default=_DEFAULT_WS_RECV_TIMEOUT, - description=( - "Per-message read timeout for WebSocket receives (seconds). " - f"Defaults to {_DEFAULT_WS_RECV_TIMEOUT}s so hung servers do not block " - "workers; use ``None`` for no limit." - ), - ) - timeout_connect: float = Field( - default=FALLBACK_TIMEOUT, - description="Timeout for establishing the WebSocket connection.", - ) - validate_backend: bool | str | dict[str, Any] = Field( - default=True, - description=( - "HTTP health check before benchmarks (same semantics as openai_http)." 
- ), - ) - extras: dict[str, Any] | None = Field( - default=None, - description="Extra fields merged into session.update (backend model wins).", - ) - - -@Backend.register("openai_realtime_ws") -class OpenAIRealtimeWebSocketBackend(Backend): - """WebSocket client for realtime (streaming) audio transcription.""" - - @classmethod - def backend_args(cls) -> type[BackendArgs]: - return OpenAIRealtimeWsBackendArgs - - def __init__( - self, - target: str, - model: str = "", - websocket_path: str = "/v1/realtime", - chunk_samples: int = 3200, - api_key: str | None = None, - verify: bool = False, - timeout: float | None = _DEFAULT_WS_RECV_TIMEOUT, - timeout_connect: float = FALLBACK_TIMEOUT, - validate_backend: bool | str | dict[str, Any] = True, - extras: dict[str, Any] | None = None, - ): - super().__init__(type_="openai_realtime_ws") - self.target = target.rstrip("/").removesuffix("/v1") - self.model = model or "" - self.websocket_path = websocket_path - self.chunk_samples = chunk_samples - self.api_key = api_key - self.verify = verify - self.timeout = timeout - self.timeout_connect = timeout_connect - self.api_routes = _WS_API_ROUTES - self.validate_backend: dict[str, Any] | None = self._resolve_validate_kwargs( - validate_backend - ) - self.extras = extras or {} - self._in_process = False - self._async_client: httpx.AsyncClient | None = None - - @property - def info(self) -> dict[str, Any]: - return { - "target": self.target, - "model": self.model, - "websocket_path": self.websocket_path, - "chunk_samples": self.chunk_samples, - "timeout": self.timeout, - "timeout_connect": self.timeout_connect, - "verify": self.verify, - "validate_backend": self.validate_backend, - } - - def _parsed_target(self) -> ParseResult: - raw = self.target if "://" in self.target else f"http://{self.target}" - return urlparse(raw) - - def _ws_url(self) -> str: - parsed = self._parsed_target() - if not parsed.netloc: - raise ValueError(f"Invalid target URL for WebSocket: {self.target!r}") - ws_scheme = "wss" if parsed.scheme in ("https", "wss") else "ws" - path = self.websocket_path - if not path.startswith("/"): - path = f"/{path}" - return f"{ws_scheme}://{parsed.netloc}{path}" - - def _ssl_context(self) -> ssl.SSLContext | None: - if self._parsed_target().scheme in ("http", "ws"): - return None - ctx = ssl.create_default_context() - if not self.verify: - ctx.check_hostname = False - ctx.verify_mode = ssl.CERT_NONE - return ctx - - def _resolve_validate_kwargs( - self, validate_backend: bool | str | dict[str, Any] - ) -> dict[str, Any] | None: - return resolve_openai_validate_kwargs( - validate_backend, self.target, self.api_routes - ) - - def _build_headers( - self, existing_headers: dict[str, str] | None = None - ) -> dict[str, str] | None: - return build_openai_headers(self.api_key, existing_headers) - - async def process_startup(self) -> None: - if self._in_process: - raise RuntimeError("Backend already started up for process.") - self._async_client = httpx.AsyncClient( - timeout=httpx.Timeout( - FALLBACK_TIMEOUT, - read=self.timeout, - connect=self.timeout_connect, - ), - verify=self.verify, - limits=httpx.Limits( - max_connections=None, - max_keepalive_connections=None, - keepalive_expiry=5.0, - ), - ) - self._in_process = True - - async def process_shutdown(self) -> None: - if not self._in_process: - raise RuntimeError("Backend not started up for process.") - client = self._async_client - if client is None: - raise RuntimeError("Backend not started up for process.") - await client.aclose() - 
self._async_client = None - self._in_process = False - - async def validate(self) -> None: - if self._async_client is None: - raise RuntimeError("Backend not started up for process.") - if not self.validate_backend: - return - validate_kwargs = {**self.validate_backend} - existing_headers = validate_kwargs.get("headers") - validate_kwargs["headers"] = self._build_headers(existing_headers) - try: - response = await self._async_client.request(**validate_kwargs) - response.raise_for_status() - except Exception as exc: - raise RuntimeError( - "Backend validation request failed. Could not connect to the server " - "or validate the backend configuration." - ) from exc - - async def available_models(self) -> list[str]: - if self._async_client is None: - raise RuntimeError("Backend not started up for process.") - target = f"{self.target}/v1/models" - response = await self._async_client.get(target, headers=self._build_headers()) - response.raise_for_status() - try: - payload: Any = response.json() - except json.JSONDecodeError as exc: - raise RuntimeError( - "Unexpected /v1/models response: body is not valid JSON" - ) from exc - return _model_ids_from_openai_models_payload(payload) - - async def default_model(self) -> str: - if self.model: - return self.model - if not self._in_process: - return "" - models = await self.available_models() - self.model = models[0] if models else "" - return self.model - - async def resolve( # type: ignore[override, misc] # noqa: C901, PLR0912, PLR0915 - self, - request: GenerationRequest, - request_info: RequestInfo, - history: list[tuple[GenerationRequest, GenerationResponse | None]] - | None = None, - ) -> AsyncIterator[tuple[GenerationResponse | None, RequestInfo]]: - if self._async_client is None: - raise RuntimeError("Backend not started up for process.") - if history: - raise NotImplementedError( - "openai_realtime_ws does not support multiturn/history yet." - ) - - audio_columns = request.columns.get("audio_column", []) - if len(audio_columns) != 1: - raise ValueError( - "Realtime transcription expects exactly one audio_column entry; " - f"got {len(audio_columns)}." - ) - - model_name = await self.default_model() - if not str(model_name).strip(): - raise RuntimeError( - "No model configured for openai_realtime_ws and /v1/models returned " - "none. Pass --model or ensure the server lists at least one model." 
- ) - - arguments = GenerationRequestArguments( - body={ - "model": model_name, - "websocket_path": self.websocket_path, - "chunk_samples": self.chunk_samples, - } - ) - - pcm_fn = _ensure_pcm16_append_b64_chunks() - chunks = pcm_fn( - audio_columns[0], - chunk_samples=self.chunk_samples, - ) - - session_update: dict[str, Any] = {"type": "session.update"} - if self.extras: - for key, val in self.extras.items(): - if key not in ("type", "model"): - session_update[key] = val - session_update["model"] = model_name - - ssl_ctx = self._ssl_context() - ws_headers = self._build_headers() - audio_handler = AudioRequestHandler() - full_text_parts: list[str] = [] - - try: - request_info.timings.request_start = time.time() - connect_kw: dict[str, Any] = { - "ssl": ssl_ctx, - "open_timeout": self.timeout_connect, - } - if ws_headers: - connect_kw["additional_headers"] = ws_headers - ws_connect = _require_ws_connect() - async with ws_connect(self._ws_url(), **connect_kw) as ws: - raw_first = await self._recv_ws(ws) - first_event = _load_ws_event(raw_first) - if first_event.get("type") == "error": - raise RuntimeError(_ws_error_message(first_event.get("error"))) - if first_event.get("type") != "session.created": - raise RuntimeError( - f"Expected session.created, got {first_event.get('type')!r}" - ) - await ws.send(json.dumps(session_update)) - for b64_chunk in chunks: - await ws.send( - json.dumps( - {"type": "input_audio_buffer.append", "audio": b64_chunk} - ) - ) - await ws.send( - json.dumps({"type": "input_audio_buffer.commit", "final": False}) - ) - # Sentinel end-of-stream for vLLM's audio queue - # (see RealtimeConnection). - await ws.send( - json.dumps({"type": "input_audio_buffer.commit", "final": True}) - ) - - ignored_events = 0 - while True: - raw = await self._recv_ws(ws) - event = _load_ws_event(raw) - et = event.get("type") - if et == "transcription.delta": - iter_time = time.time() - if request_info.timings.first_request_iteration is None: - request_info.timings.first_request_iteration = iter_time - request_info.timings.last_request_iteration = iter_time - request_info.timings.request_iterations += 1 - delta = event.get("delta") or "" - full_text_parts.append(delta) - if request_info.timings.first_token_iteration is None: - request_info.timings.first_token_iteration = iter_time - request_info.timings.token_iterations = 0 - yield None, request_info - request_info.timings.last_token_iteration = iter_time - request_info.timings.token_iterations += 1 if delta else 0 - - elif et == "transcription.done": - iter_time = time.time() - request_info.timings.request_end = iter_time - full_text = event.get("text") or "".join(full_text_parts) - if request_info.timings.first_token_iteration is None: - if request_info.timings.first_request_iteration is None: - request_info.timings.first_request_iteration = iter_time - request_info.timings.last_request_iteration = iter_time - request_info.timings.request_iterations += 1 - request_info.timings.first_token_iteration = iter_time - request_info.timings.token_iterations = 0 - yield None, request_info - request_info.timings.last_token_iteration = iter_time - request_info.timings.token_iterations += ( - 1 if full_text else 0 - ) - usage_dict = _normalize_transcription_usage(event.get("usage")) - inp, outp = audio_handler.extract_metrics(usage_dict, full_text) - yield ( - GenerationResponse( - request_id=request.request_id, - request_args=arguments.model_dump_json(), - text=full_text, - input_metrics=inp, - output_metrics=outp, - ), - request_info, - ) - break 
- elif et == "error": - raise RuntimeError(_ws_error_message(event.get("error"))) - else: - ignored_events += 1 - if ignored_events > _MAX_IGNORED_WS_EVENT_TYPES: - raise RuntimeError( - "Exceeded maximum ignored realtime WebSocket events " - f"without transcription.done (last type={et!r})." - ) - continue - - except asyncio.CancelledError as err: - text_so_far = "".join(full_text_parts) - inp, outp = audio_handler.extract_metrics(None, text_so_far or "") - yield ( - GenerationResponse( - request_id=request.request_id, - request_args=arguments.model_dump_json(), - text=text_so_far, - input_metrics=inp, - output_metrics=outp, - ), - request_info, - ) - raise err - finally: - if ( - request_info.timings.request_start is not None - and request_info.timings.request_end is None - ): - request_info.timings.request_end = time.time() - - async def _recv_ws(self, ws: ClientConnection) -> str: - if self.timeout is None: - msg = await ws.recv() - else: - msg = await asyncio.wait_for(ws.recv(), timeout=self.timeout) - if isinstance(msg, bytes): - return msg.decode() - return str(msg) diff --git a/tests/e2e/test_realtime_ws_e2e.py b/tests/e2e/test_realtime_ws_e2e.py index 160f76ca0..db8cfa125 100644 --- a/tests/e2e/test_realtime_ws_e2e.py +++ b/tests/e2e/test_realtime_ws_e2e.py @@ -23,7 +23,7 @@ allow_module_level=True, ) -from guidellm.backends.openai.realtime_ws import OpenAIRealtimeWebSocketBackend +from guidellm.backends.openai.websocket import OpenAIWebSocketBackend from guidellm.schemas import GenerationRequest, RequestInfo, RequestTimings @@ -120,7 +120,7 @@ async def test_realtime_ws_full_stack_in_one_event_loop( stub = make_realtime_transcription_stub_handler(session_id="e2e-stub-sess") async with serve(stub, "127.0.0.1", port): - be = OpenAIRealtimeWebSocketBackend( + be = OpenAIWebSocketBackend( target=f"http://127.0.0.1:{port}", model="stub-model", validate_backend=False, diff --git a/tests/unit/backends/openai/test_realtime_ws.py b/tests/unit/backends/openai/test_realtime_ws.py index b00821d53..8c7d7c597 100644 --- a/tests/unit/backends/openai/test_realtime_ws.py +++ b/tests/unit/backends/openai/test_realtime_ws.py @@ -1,4 +1,4 @@ -"""Tests for OpenAIRealtimeWebSocketBackend. +"""Tests for OpenAIWebSocketBackend. 
## WRITTEN BY AI ## """ @@ -20,10 +20,10 @@ allow_module_level=True, ) -from guidellm.backends.openai.realtime_ws import ( +from guidellm.backends.openai.websocket import ( _DEFAULT_WS_RECV_TIMEOUT, - OpenAIRealtimeWebSocketBackend, - OpenAIRealtimeWsBackendArgs, + OpenAIWebSocketBackend, + OpenAIWebSocketBackendArgs, ) from guidellm.schemas import GenerationRequest, RequestInfo, RequestTimings @@ -67,13 +67,13 @@ async def handler(ws: object) -> None: ) monkeypatch.setattr( - "guidellm.backends.openai.realtime_ws.pcm16_append_b64_chunks", + "guidellm.backends.openai.websocket.pcm16_append_b64_chunks", lambda *a, **k: ["YWFhYQ=="], ) async with serve(handler, "127.0.0.1", 0) as server: port = server.sockets[0].getsockname()[1] - be = OpenAIRealtimeWebSocketBackend( + be = OpenAIWebSocketBackend( target=f"http://127.0.0.1:{port}", model="test-model", validate_backend=False, @@ -135,13 +135,13 @@ async def handler(ws: object) -> None: ) monkeypatch.setattr( - "guidellm.backends.openai.realtime_ws.pcm16_append_b64_chunks", + "guidellm.backends.openai.websocket.pcm16_append_b64_chunks", lambda *a, **k: ["YWFhYQ=="], ) async with serve(handler, "127.0.0.1", 0) as server: port = server.sockets[0].getsockname()[1] - be = OpenAIRealtimeWebSocketBackend( + be = OpenAIWebSocketBackend( target=f"http://127.0.0.1:{port}", model="test-model", validate_backend=False, @@ -205,13 +205,13 @@ async def handler(ws: object) -> None: ) monkeypatch.setattr( - "guidellm.backends.openai.realtime_ws.pcm16_append_b64_chunks", + "guidellm.backends.openai.websocket.pcm16_append_b64_chunks", lambda *a, **k: ["YWFhYQ=="], ) async with serve(handler, "127.0.0.1", 0) as server: port = server.sockets[0].getsockname()[1] - be = OpenAIRealtimeWebSocketBackend( + be = OpenAIWebSocketBackend( target=f"http://127.0.0.1:{port}", model="test-model", validate_backend=False, @@ -257,13 +257,13 @@ async def handler(ws: object) -> None: return monkeypatch.setattr( - "guidellm.backends.openai.realtime_ws.pcm16_append_b64_chunks", + "guidellm.backends.openai.websocket.pcm16_append_b64_chunks", lambda *a, **k: ["YQ=="], ) async with serve(handler, "127.0.0.1", 0) as server: port = server.sockets[0].getsockname()[1] - be = OpenAIRealtimeWebSocketBackend( + be = OpenAIWebSocketBackend( target=f"http://127.0.0.1:{port}", model="m", validate_backend=False, @@ -285,7 +285,7 @@ async def test_first_message_error_event_raises( monkeypatch: pytest.MonkeyPatch, ) -> None: monkeypatch.setattr( - "guidellm.backends.openai.realtime_ws.pcm16_append_b64_chunks", + "guidellm.backends.openai.websocket.pcm16_append_b64_chunks", lambda *a, **k: ["YQ=="], ) @@ -294,7 +294,7 @@ async def handler(ws: object) -> None: async with serve(handler, "127.0.0.1", 0) as server: port = server.sockets[0].getsockname()[1] - be = OpenAIRealtimeWebSocketBackend( + be = OpenAIWebSocketBackend( target=f"http://127.0.0.1:{port}", model="m", validate_backend=False, @@ -316,7 +316,7 @@ async def test_first_message_not_session_created_raises( monkeypatch: pytest.MonkeyPatch, ) -> None: monkeypatch.setattr( - "guidellm.backends.openai.realtime_ws.pcm16_append_b64_chunks", + "guidellm.backends.openai.websocket.pcm16_append_b64_chunks", lambda *a, **k: ["YQ=="], ) @@ -325,7 +325,7 @@ async def handler(ws: object) -> None: async with serve(handler, "127.0.0.1", 0) as server: port = server.sockets[0].getsockname()[1] - be = OpenAIRealtimeWebSocketBackend( + be = OpenAIWebSocketBackend( target=f"http://127.0.0.1:{port}", model="m", validate_backend=False, @@ -358,13 +358,13 @@ 
async def handler(ws: object) -> None: await ws.send("{not-json") monkeypatch.setattr( - "guidellm.backends.openai.realtime_ws.pcm16_append_b64_chunks", + "guidellm.backends.openai.websocket.pcm16_append_b64_chunks", lambda *a, **k: ["YQ=="], ) async with serve(handler, "127.0.0.1", 0) as server: port = server.sockets[0].getsockname()[1] - be = OpenAIRealtimeWebSocketBackend( + be = OpenAIWebSocketBackend( target=f"http://127.0.0.1:{port}", model="m", validate_backend=False, @@ -383,7 +383,7 @@ async def handler(ws: object) -> None: @pytest.mark.asyncio async def test_resolve_requires_process_startup() -> None: - be = OpenAIRealtimeWebSocketBackend( + be = OpenAIWebSocketBackend( target="http://127.0.0.1:9", model="m", validate_backend=False, @@ -400,7 +400,7 @@ async def test_resolve_requires_process_startup() -> None: @pytest.mark.asyncio async def test_resolve_rejects_history() -> None: - be = OpenAIRealtimeWebSocketBackend( + be = OpenAIWebSocketBackend( target="http://127.0.0.1:9", model="m", validate_backend=False, @@ -420,7 +420,7 @@ async def test_resolve_rejects_history() -> None: @pytest.mark.asyncio async def test_resolve_rejects_wrong_audio_column_count() -> None: - be = OpenAIRealtimeWebSocketBackend( + be = OpenAIWebSocketBackend( target="http://127.0.0.1:9", model="m", validate_backend=False, @@ -463,14 +463,14 @@ async def handler(ws: object) -> None: await _bounded_ws_recv(ws) monkeypatch.setattr( - "guidellm.backends.openai.realtime_ws.pcm16_append_b64_chunks", + "guidellm.backends.openai.websocket.pcm16_append_b64_chunks", lambda *a, **k: ["YQ=="], ) results: list = [] async with serve(handler, "127.0.0.1", 0) as server: port = server.sockets[0].getsockname()[1] - be = OpenAIRealtimeWebSocketBackend( + be = OpenAIWebSocketBackend( target=f"http://127.0.0.1:{port}", model="m", validate_backend=False, @@ -514,13 +514,13 @@ async def handler(ws: object) -> None: await ws.send("[]") monkeypatch.setattr( - "guidellm.backends.openai.realtime_ws.pcm16_append_b64_chunks", + "guidellm.backends.openai.websocket.pcm16_append_b64_chunks", lambda *a, **k: ["YQ=="], ) async with serve(handler, "127.0.0.1", 0) as server: port = server.sockets[0].getsockname()[1] - be = OpenAIRealtimeWebSocketBackend( + be = OpenAIWebSocketBackend( target=f"http://127.0.0.1:{port}", model="m", validate_backend=False, @@ -542,7 +542,7 @@ async def test_excessive_ignored_events_raises( monkeypatch: pytest.MonkeyPatch, ) -> None: monkeypatch.setattr( - "guidellm.backends.openai.realtime_ws._MAX_IGNORED_WS_EVENT_TYPES", + "guidellm.backends.openai.websocket._MAX_IGNORED_WS_EVENT_TYPES", 2, ) @@ -557,13 +557,13 @@ async def handler(ws: object) -> None: await ws.send(json.dumps({"type": "noise.event"})) monkeypatch.setattr( - "guidellm.backends.openai.realtime_ws.pcm16_append_b64_chunks", + "guidellm.backends.openai.websocket.pcm16_append_b64_chunks", lambda *a, **k: ["YQ=="], ) async with serve(handler, "127.0.0.1", 0) as server: port = server.sockets[0].getsockname()[1] - be = OpenAIRealtimeWebSocketBackend( + be = OpenAIWebSocketBackend( target=f"http://127.0.0.1:{port}", model="m", validate_backend=False, @@ -586,7 +586,7 @@ async def test_available_models_parses_response(httpx_mock: object) -> None: url="http://127.0.0.1:9/v1/models", json={"data": [{"id": "a"}, {"id": "b"}]}, ) - be = OpenAIRealtimeWebSocketBackend( + be = OpenAIWebSocketBackend( target="http://127.0.0.1:9", validate_backend=False, ) @@ -601,7 +601,7 @@ async def test_available_models_bad_data_shape_raises(httpx_mock: object) -> Non 
url="http://127.0.0.1:9/v1/models", json={"data": "not-a-list"}, ) - be = OpenAIRealtimeWebSocketBackend( + be = OpenAIWebSocketBackend( target="http://127.0.0.1:9", validate_backend=False, ) @@ -621,10 +621,10 @@ async def test_resolve_raises_when_no_model_and_empty_catalog( json={"data": []}, ) monkeypatch.setattr( - "guidellm.backends.openai.realtime_ws.pcm16_append_b64_chunks", + "guidellm.backends.openai.websocket.pcm16_append_b64_chunks", lambda *a, **k: ["YQ=="], ) - be = OpenAIRealtimeWebSocketBackend( + be = OpenAIWebSocketBackend( target="http://127.0.0.1:9", model="", validate_backend=False, @@ -646,10 +646,10 @@ async def test_resolve_invalid_ws_target_url_raises( monkeypatch: pytest.MonkeyPatch, ) -> None: monkeypatch.setattr( - "guidellm.backends.openai.realtime_ws.pcm16_append_b64_chunks", + "guidellm.backends.openai.websocket.pcm16_append_b64_chunks", lambda *a, **k: ["YQ=="], ) - be = OpenAIRealtimeWebSocketBackend( + be = OpenAIWebSocketBackend( target="", model="m", validate_backend=False, @@ -681,13 +681,13 @@ async def handler(ws: object) -> None: ) monkeypatch.setattr( - "guidellm.backends.openai.realtime_ws.pcm16_append_b64_chunks", + "guidellm.backends.openai.websocket.pcm16_append_b64_chunks", lambda *a, **k: ["YQ=="], ) async with serve(handler, "127.0.0.1", 0) as server: port = server.sockets[0].getsockname()[1] - be = OpenAIRealtimeWebSocketBackend( + be = OpenAIWebSocketBackend( target=f"http://127.0.0.1:{port}", model="m", validate_backend=False, @@ -705,7 +705,7 @@ async def handler(ws: object) -> None: def test_openai_realtime_backend_args_model() -> None: - a = OpenAIRealtimeWsBackendArgs(target="http://localhost:8000", model="x") + a = OpenAIWebSocketBackendArgs(target="http://localhost:8000", model="x") assert a.websocket_path == "/v1/realtime" assert a.chunk_samples == 3200 assert a.timeout == _DEFAULT_WS_RECV_TIMEOUT diff --git a/tests/unit/backends/test_backend.py b/tests/unit/backends/test_backend.py index 13c707496..d4d6744e5 100644 --- a/tests/unit/backends/test_backend.py +++ b/tests/unit/backends/test_backend.py @@ -324,15 +324,15 @@ def test_openai_realtime_ws_backend_registered(self): ## WRITTEN BY AI ## """ from guidellm.backends.openai import ( - OpenAIRealtimeWebSocketBackend, - OpenAIRealtimeWsBackendArgs, + OpenAIWebSocketBackend, + OpenAIWebSocketBackendArgs, ) assert Backend.is_registered("openai_realtime_ws") realtime_args = Backend.get_backend_args("openai_realtime_ws") - assert realtime_args is OpenAIRealtimeWsBackendArgs + assert realtime_args is OpenAIWebSocketBackendArgs backend = Backend.create("openai_realtime_ws", target="http://localhost:9000") - assert isinstance(backend, OpenAIRealtimeWebSocketBackend) + assert isinstance(backend, OpenAIWebSocketBackend) assert backend.type_ == "openai_realtime_ws" def test_openai_backend_registered(self): diff --git a/tests/unit/benchmark/schemas/generative/test_entrypoints.py b/tests/unit/benchmark/schemas/generative/test_entrypoints.py index b251bb096..099d0325f 100644 --- a/tests/unit/benchmark/schemas/generative/test_entrypoints.py +++ b/tests/unit/benchmark/schemas/generative/test_entrypoints.py @@ -12,7 +12,7 @@ from guidellm.backends.backend import BackendArgs from guidellm.backends.openai.http import OpenAIHttpBackendArgs -from guidellm.backends.openai.realtime_ws import OpenAIRealtimeWsBackendArgs +from guidellm.backends.openai.websocket import OpenAIWebSocketBackendArgs from guidellm.benchmark.schemas.generative.entrypoints import ( BenchmarkGenerativeTextArgs, ) @@ -69,7 +69,7 @@ 
def test_openai_realtime_ws_backend_kwargs_validates(self) -> None: } ) assert args.backend == "openai_realtime_ws" - assert isinstance(args.backend_kwargs, OpenAIRealtimeWsBackendArgs) + assert isinstance(args.backend_kwargs, OpenAIWebSocketBackendArgs) assert args.backend_kwargs.target == "http://localhost:8000" assert args.backend_kwargs.model == "rt-model" From 55cd5818f4bb69361a34b01991e497f614be73a6 Mon Sep 17 00:00:00 2001 From: Uri Shaket Date: Mon, 4 May 2026 17:35:47 +0300 Subject: [PATCH 07/12] CR Signed-off-by: Uri Shaket --- src/guidellm/backends/openai/common.py | 74 +++ src/guidellm/backends/openai/websocket.py | 567 ++++++++++++++++++++++ 2 files changed, 641 insertions(+) create mode 100644 src/guidellm/backends/openai/common.py create mode 100644 src/guidellm/backends/openai/websocket.py diff --git a/src/guidellm/backends/openai/common.py b/src/guidellm/backends/openai/common.py new file mode 100644 index 000000000..73e489e2b --- /dev/null +++ b/src/guidellm/backends/openai/common.py @@ -0,0 +1,74 @@ +"""Shared helpers for OpenAI-compatible HTTP and WebSocket backends.""" + +from __future__ import annotations + +from typing import Any + +__all__ = [ + "FALLBACK_TIMEOUT", + "build_headers", + "resolve_validate_kwargs", +] + +# NOTE: This value is taken from httpx's default +FALLBACK_TIMEOUT = 5.0 + + +def build_headers( + api_key: str | None, + existing_headers: dict[str, str] | None = None, +) -> dict[str, str] | None: + """ + Build headers with bearer authentication for OpenAI-compatible requests. + + Merges the Authorization bearer token (if ``api_key`` is set) with any + existing headers. User-provided headers take precedence over the bearer token. + + :param api_key: Optional API key for Bearer authentication + :param existing_headers: Optional headers to merge in + :return: Headers dict, or ``None`` if there are no headers to send + """ + headers: dict[str, str] = {} + if api_key: + headers["Authorization"] = f"Bearer {api_key}" + if existing_headers: + headers = {**headers, **existing_headers} + return headers or None + + +def resolve_validate_kwargs( + validate_backend: bool | str | dict[str, Any], + target: str, + api_routes: dict[str, str], +) -> dict[str, Any] | None: + """ + Normalize ``validate_backend`` into kwargs for ``httpx`` request(). + + Accepts the same shapes as + :class:`~guidellm.backends.openai.http.OpenAIHTTPBackend`. + """ + raw = validate_backend + if not raw: + return None + + if raw is True: + raw = "/health" + + if isinstance(raw, str): + url = f"{target}/{api_routes[raw]}" if raw in api_routes else raw + request_kwargs: dict[str, Any] = {"method": "GET", "url": url} + elif isinstance(raw, dict): + request_kwargs = raw + else: + request_kwargs = raw + + if not isinstance(request_kwargs, dict) or "url" not in request_kwargs: + raise ValueError( + "validate_backend must be a boolean, string, or dictionary and contain " + f"a target URL. Got: {request_kwargs}" + ) + + if "method" not in request_kwargs: + request_kwargs["method"] = "GET" + + return request_kwargs diff --git a/src/guidellm/backends/openai/websocket.py b/src/guidellm/backends/openai/websocket.py new file mode 100644 index 000000000..dc0d0f224 --- /dev/null +++ b/src/guidellm/backends/openai/websocket.py @@ -0,0 +1,567 @@ +""" +WebSocket backend for vLLM-compatible realtime audio transcription. 
+ +Implements the JSON event protocol used by vLLM's ``/v1/realtime`` endpoint: +``session.created`` → ``session.update`` → ``input_audio_buffer.append`` → +``input_audio_buffer.commit`` (``final: false`` starts transcription, then +``final: true`` ends the audio stream) → ``transcription.delta`` / +``transcription.done``. +""" + +from __future__ import annotations + +import asyncio +import json +import ssl +import time +from collections.abc import AsyncIterator +from typing import TYPE_CHECKING, Any +from urllib.parse import ParseResult, urlparse + +import httpx +from pydantic import Field + +if TYPE_CHECKING: + from websockets.asyncio.client import ClientConnection + +from guidellm.backends.backend import Backend, BackendArgs +from guidellm.backends.openai.common import ( + FALLBACK_TIMEOUT, + build_headers, + resolve_validate_kwargs, +) +from guidellm.backends.openai.request_handlers import AudioRequestHandler +from guidellm.schemas import ( + GenerationRequest, + GenerationRequestArguments, + GenerationResponse, + RequestInfo, +) + +__all__ = [ + "OpenAIWebSocketBackend", + "OpenAIWebSocketBackendArgs", +] + +_WS_API_ROUTES = { + "/health": "health", + "/v1/models": "v1/models", +} + +# Guard against a misbehaving server that only emits ignored event types. +_MAX_IGNORED_WS_EVENT_TYPES = 50_000 + +# Per-message WebSocket recv timeout default so benchmark workers do not hang forever +# on a silent peer. Pass ``timeout=None`` to wait indefinitely. +_DEFAULT_WS_RECV_TIMEOUT = 120.0 + +_AUDIO_EXTRA_HINT = ( + "Install optional audio extras: pip install 'guidellm[audio]' " + "(includes websockets and torchcodec for realtime transcription)." +) + + +def _require_ws_connect() -> Any: + try: + from websockets.asyncio.client import connect as ws_connect + except ImportError as exc: + raise ImportError( + "The openai_realtime_ws backend requires the 'websockets' package. 
" + + _AUDIO_EXTRA_HINT + ) from exc + return ws_connect + + +def _ws_error_message(err: Any) -> str: + """Format WebSocket ``error`` for exceptions (supports dict payloads).""" + if isinstance(err, dict): + msg = err.get("message") or err.get("msg") + code = err.get("code") + parts = [str(p) for p in (code, msg) if p] + if parts: + return ": ".join(parts) + try: + return json.dumps(err)[:500] + except (TypeError, ValueError): + return repr(err) + if err is None or err == "": + return "WebSocket error" + return str(err) + + +def _model_ids_from_openai_models_payload(payload: Any) -> list[str]: + """Parse ``GET /v1/models`` JSON body; raise RuntimeError if shape is unexpected.""" + if not isinstance(payload, dict): + raise RuntimeError( + "Unexpected /v1/models response: top-level JSON must be an object, " + f"got {type(payload).__name__}" + ) + data = payload.get("data") + if not isinstance(data, list): + raise RuntimeError( + "Unexpected /v1/models response: 'data' must be a list, " + f"got {type(data).__name__}" + ) + ids: list[str] = [] + for i, item in enumerate(data): + if not isinstance(item, dict) or "id" not in item: + raise RuntimeError( + "Unexpected /v1/models response: each entry must be an object with " + f"'id' (index {i})" + ) + ids.append(str(item["id"])) + return ids + + +def _load_ws_event(raw: str) -> dict[str, Any]: + """Parse a JSON WebSocket text frame; raise RuntimeError on invalid JSON.""" + try: + parsed: Any = json.loads(raw) + except json.JSONDecodeError as exc: + raise RuntimeError( + f"Invalid JSON from realtime WebSocket: {exc.msg} at position {exc.pos}" + ) from exc + if not isinstance(parsed, dict): + raise RuntimeError( + f"Expected JSON object from realtime WebSocket, got {type(parsed).__name__}" + ) + return parsed + + +# Lazy import cache (no ``global``); tests may set ``pcm16_append_b64_chunks`` directly. +pcm16_append_b64_chunks: Any = None +_pcm_imported_fn: dict[str, Any] = {"fn": None} + + +def _ensure_pcm16_append_b64_chunks() -> Any: + if pcm16_append_b64_chunks is not None: + return pcm16_append_b64_chunks + if _pcm_imported_fn["fn"] is not None: + return _pcm_imported_fn["fn"] + try: + from guidellm.extras.audio import pcm16_append_b64_chunks as fn + except ImportError as exc: + raise ImportError( + "The openai_realtime_ws backend requires the audio extras for PCM " + "handling used in realtime transcription. 
" + _AUDIO_EXTRA_HINT + ) from exc + _pcm_imported_fn["fn"] = fn + return fn + + +def _coerce_usage_int(value: Any) -> int | None: + if isinstance(value, bool): + return None + if isinstance(value, int | float): + return int(value) + if isinstance(value, str): + stripped = value.strip() + if not stripped: + return None + try: + return int(stripped) + except ValueError: + return None + return None + + +def _normalize_transcription_usage( + raw_usage: Any, +) -> dict[str, int | dict[str, int]] | None: + """Coerce OpenAI-style usage dict values to ints (including numeric strings).""" + if not isinstance(raw_usage, dict): + return None + result: dict[str, int | dict[str, int]] = {} + for key, val in raw_usage.items(): + if isinstance(val, dict): + inner: dict[str, int] = {} + for ik, iv in val.items(): + num = _coerce_usage_int(iv) + if num is not None: + inner[ik] = num + if inner: + result[key] = inner + else: + num = _coerce_usage_int(val) + if num is not None: + result[key] = num + return result if result else None + + +class OpenAIWebSocketBackendArgs(BackendArgs): + """Arguments for creating the realtime WebSocket backend.""" + + target: str = Field( + description=( + "HTTP(S) base URL of the server (WebSocket URL is derived from it)." + ), + json_schema_extra={ + "error_message": ( + "Backend '{backend_type}' requires --target with a valid URL." + ) + }, + ) + model: str | None = Field( + default=None, + description="Model identifier (required unless discoverable from /v1/models).", + ) + websocket_path: str = Field( + default="/v1/realtime", + description="WebSocket path on the server (default /v1/realtime).", + ) + chunk_samples: int = Field( + default=3200, + ge=1, + description="PCM16 frames per input_audio_buffer.append chunk (16 kHz).", + ) + api_key: str | None = Field(default=None, description="Bearer token if required.") + verify: bool = Field(default=False, description="Verify TLS certificates.") + timeout: float | None = Field( + default=_DEFAULT_WS_RECV_TIMEOUT, + description=( + "Per-message read timeout for WebSocket receives (seconds). " + f"Defaults to {_DEFAULT_WS_RECV_TIMEOUT}s so hung servers do not block " + "workers; use ``None`` for no limit." + ), + ) + timeout_connect: float = Field( + default=FALLBACK_TIMEOUT, + description="Timeout for establishing the WebSocket connection.", + ) + validate_backend: bool | str | dict[str, Any] = Field( + default=True, + description=( + "HTTP health check before benchmarks (same semantics as openai_http)." 
+ ), + ) + extras: dict[str, Any] | None = Field( + default=None, + description="Extra fields merged into session.update (backend model wins).", + ) + + +@Backend.register("openai_realtime_ws") +class OpenAIWebSocketBackend(Backend): + """WebSocket client for realtime (streaming) audio transcription.""" + + @classmethod + def backend_args(cls) -> type[BackendArgs]: + return OpenAIWebSocketBackendArgs + + def __init__( + self, + target: str, + model: str = "", + websocket_path: str = "/v1/realtime", + chunk_samples: int = 3200, + api_key: str | None = None, + verify: bool = False, + timeout: float | None = _DEFAULT_WS_RECV_TIMEOUT, + timeout_connect: float = FALLBACK_TIMEOUT, + validate_backend: bool | str | dict[str, Any] = True, + extras: dict[str, Any] | None = None, + ): + super().__init__(type_="openai_realtime_ws") + self.target = target.rstrip("/").removesuffix("/v1") + self.model = model or "" + self.websocket_path = websocket_path + self.chunk_samples = chunk_samples + self.api_key = api_key + self.verify = verify + self.timeout = timeout + self.timeout_connect = timeout_connect + self.api_routes = _WS_API_ROUTES + self.validate_backend: dict[str, Any] | None = resolve_validate_kwargs( + validate_backend + ) + self.extras = extras or {} + self._in_process = False + self._async_client: httpx.AsyncClient | None = None + + @property + def info(self) -> dict[str, Any]: + return { + "target": self.target, + "model": self.model, + "websocket_path": self.websocket_path, + "chunk_samples": self.chunk_samples, + "timeout": self.timeout, + "timeout_connect": self.timeout_connect, + "verify": self.verify, + "validate_backend": self.validate_backend, + } + + def _parsed_target(self) -> ParseResult: + raw = self.target if "://" in self.target else f"http://{self.target}" + return urlparse(raw) + + def _ws_url(self) -> str: + parsed = self._parsed_target() + if not parsed.netloc: + raise ValueError(f"Invalid target URL for WebSocket: {self.target!r}") + ws_scheme = "wss" if parsed.scheme in ("https", "wss") else "ws" + path = self.websocket_path + if not path.startswith("/"): + path = f"/{path}" + return f"{ws_scheme}://{parsed.netloc}{path}" + + def _ssl_context(self) -> ssl.SSLContext | None: + if self._parsed_target().scheme in ("http", "ws"): + return None + ctx = ssl.create_default_context() + if not self.verify: + ctx.check_hostname = False + ctx.verify_mode = ssl.CERT_NONE + return ctx + + + def _build_headers( + self, existing_headers: dict[str, str] | None = None + ) -> dict[str, str] | None: + return build_headers(self.api_key, existing_headers) + + async def process_startup(self) -> None: + if self._in_process: + raise RuntimeError("Backend already started up for process.") + self._async_client = httpx.AsyncClient( + timeout=httpx.Timeout( + FALLBACK_TIMEOUT, + read=self.timeout, + connect=self.timeout_connect, + ), + verify=self.verify, + limits=httpx.Limits( + max_connections=None, + max_keepalive_connections=None, + keepalive_expiry=5.0, + ), + ) + self._in_process = True + + async def process_shutdown(self) -> None: + if not self._in_process: + raise RuntimeError("Backend not started up for process.") + client = self._async_client + if client is None: + raise RuntimeError("Backend not started up for process.") + await client.aclose() + self._async_client = None + self._in_process = False + + async def validate(self) -> None: + if self._async_client is None: + raise RuntimeError("Backend not started up for process.") + if not self.validate_backend: + return + validate_kwargs = 
{**self.validate_backend}
+        existing_headers = validate_kwargs.get("headers")
+        validate_kwargs["headers"] = self._build_headers(existing_headers)
+        try:
+            response = await self._async_client.request(**validate_kwargs)
+            response.raise_for_status()
+        except Exception as exc:
+            raise RuntimeError(
+                "Backend validation request failed. Could not connect to the server "
+                "or validate the backend configuration."
+            ) from exc
+
+    async def available_models(self) -> list[str]:
+        if self._async_client is None:
+            raise RuntimeError("Backend not started up for process.")
+        target = f"{self.target}/v1/models"
+        response = await self._async_client.get(target, headers=build_headers(self.api_key))
+        response.raise_for_status()
+        try:
+            payload: Any = response.json()
+        except json.JSONDecodeError as exc:
+            raise RuntimeError(
+                "Unexpected /v1/models response: body is not valid JSON"
+            ) from exc
+        return _model_ids_from_openai_models_payload(payload)
+
+    async def default_model(self) -> str:
+        if self.model:
+            return self.model
+        if not self._in_process:
+            return ""
+        models = await self.available_models()
+        self.model = models[0] if models else ""
+        return self.model
+
+    async def resolve(  # type: ignore[override, misc] # noqa: C901, PLR0912, PLR0915
+        self,
+        request: GenerationRequest,
+        request_info: RequestInfo,
+        history: list[tuple[GenerationRequest, GenerationResponse | None]]
+        | None = None,
+    ) -> AsyncIterator[tuple[GenerationResponse | None, RequestInfo]]:
+        if self._async_client is None:
+            raise RuntimeError("Backend not started up for process.")
+        if history:
+            raise NotImplementedError(
+                "openai_realtime_ws does not support multiturn/history yet."
+            )
+
+        audio_columns = request.columns.get("audio_column", [])
+        if len(audio_columns) != 1:
+            raise ValueError(
+                "Realtime transcription expects exactly one audio_column entry; "
+                f"got {len(audio_columns)}."
+            )
+
+        model_name = await self.default_model()
+        if not str(model_name).strip():
+            raise RuntimeError(
+                "No model configured for openai_realtime_ws and /v1/models returned "
+                "none. Pass --model or ensure the server lists at least one model."
+ ) + + arguments = GenerationRequestArguments( + body={ + "model": model_name, + "websocket_path": self.websocket_path, + "chunk_samples": self.chunk_samples, + } + ) + + pcm_fn = _ensure_pcm16_append_b64_chunks() + chunks = pcm_fn( + audio_columns[0], + chunk_samples=self.chunk_samples, + ) + + session_update: dict[str, Any] = {"type": "session.update"} + if self.extras: + for key, val in self.extras.items(): + if key not in ("type", "model"): + session_update[key] = val + session_update["model"] = model_name + + ssl_ctx = self._ssl_context() + ws_headers = build_headers(self.api_key) + audio_handler = AudioRequestHandler() + full_text_parts: list[str] = [] + + try: + request_info.timings.request_start = time.time() + connect_kw: dict[str, Any] = { + "ssl": ssl_ctx, + "open_timeout": self.timeout_connect, + } + if ws_headers: + connect_kw["additional_headers"] = ws_headers + ws_connect = _require_ws_connect() + async with ws_connect(self._ws_url(), **connect_kw) as ws: + raw_first = await self._recv_ws(ws) + first_event = _load_ws_event(raw_first) + if first_event.get("type") == "error": + raise RuntimeError(_ws_error_message(first_event.get("error"))) + if first_event.get("type") != "session.created": + raise RuntimeError( + f"Expected session.created, got {first_event.get('type')!r}" + ) + await ws.send(json.dumps(session_update)) + for b64_chunk in chunks: + await ws.send( + json.dumps( + {"type": "input_audio_buffer.append", "audio": b64_chunk} + ) + ) + await ws.send( + json.dumps({"type": "input_audio_buffer.commit", "final": False}) + ) + # Sentinel end-of-stream for vLLM's audio queue + # (see RealtimeConnection). + await ws.send( + json.dumps({"type": "input_audio_buffer.commit", "final": True}) + ) + + ignored_events = 0 + while True: + raw = await self._recv_ws(ws) + event = _load_ws_event(raw) + et = event.get("type") + if et == "transcription.delta": + iter_time = time.time() + if request_info.timings.first_request_iteration is None: + request_info.timings.first_request_iteration = iter_time + request_info.timings.last_request_iteration = iter_time + request_info.timings.request_iterations += 1 + delta = event.get("delta") or "" + full_text_parts.append(delta) + if request_info.timings.first_token_iteration is None: + request_info.timings.first_token_iteration = iter_time + request_info.timings.token_iterations = 0 + yield None, request_info + request_info.timings.last_token_iteration = iter_time + request_info.timings.token_iterations += 1 if delta else 0 + + elif et == "transcription.done": + iter_time = time.time() + request_info.timings.request_end = iter_time + full_text = event.get("text") or "".join(full_text_parts) + if request_info.timings.first_token_iteration is None: + if request_info.timings.first_request_iteration is None: + request_info.timings.first_request_iteration = iter_time + request_info.timings.last_request_iteration = iter_time + request_info.timings.request_iterations += 1 + request_info.timings.first_token_iteration = iter_time + request_info.timings.token_iterations = 0 + yield None, request_info + request_info.timings.last_token_iteration = iter_time + request_info.timings.token_iterations += ( + 1 if full_text else 0 + ) + usage_dict = _normalize_transcription_usage(event.get("usage")) + inp, outp = audio_handler.extract_metrics(usage_dict, full_text) + yield ( + GenerationResponse( + request_id=request.request_id, + request_args=arguments.model_dump_json(), + text=full_text, + input_metrics=inp, + output_metrics=outp, + ), + request_info, + ) + 
break + elif et == "error": + raise RuntimeError(_ws_error_message(event.get("error"))) + else: + ignored_events += 1 + if ignored_events > _MAX_IGNORED_WS_EVENT_TYPES: + raise RuntimeError( + "Exceeded maximum ignored realtime WebSocket events " + f"without transcription.done (last type={et!r})." + ) + continue + + except asyncio.CancelledError as err: + text_so_far = "".join(full_text_parts) + inp, outp = audio_handler.extract_metrics(None, text_so_far or "") + yield ( + GenerationResponse( + request_id=request.request_id, + request_args=arguments.model_dump_json(), + text=text_so_far, + input_metrics=inp, + output_metrics=outp, + ), + request_info, + ) + raise err + finally: + if ( + request_info.timings.request_start is not None + and request_info.timings.request_end is None + ): + request_info.timings.request_end = time.time() + + async def _recv_ws(self, ws: ClientConnection) -> str: + if self.timeout is None: + msg = await ws.recv() + else: + msg = await asyncio.wait_for(ws.recv(), timeout=self.timeout) + if isinstance(msg, bytes): + return msg.decode() + return str(msg) From fc4ee667e0fbfbf10a67461a555e4b9181a6d452 Mon Sep 17 00:00:00 2001 From: Uri Shaket Date: Mon, 4 May 2026 17:49:56 +0300 Subject: [PATCH 08/12] CR Signed-off-by: Uri Shaket --- src/guidellm/backends/__init__.py | 8 ++++---- src/guidellm/backends/openai/__init__.py | 6 +++--- src/guidellm/backends/openai/http.py | 5 +++-- src/guidellm/backends/openai/websocket.py | 9 ++++++--- uv.lock | 2 +- 5 files changed, 17 insertions(+), 13 deletions(-) diff --git a/src/guidellm/backends/__init__.py b/src/guidellm/backends/__init__.py index 812e70f5b..1c7e48b92 100644 --- a/src/guidellm/backends/__init__.py +++ b/src/guidellm/backends/__init__.py @@ -16,10 +16,10 @@ AudioRequestHandler, ChatCompletionsRequestHandler, OpenAIHTTPBackend, - OpenAIWebSocketBackend, - OpenAIWebSocketBackendArgs, OpenAIRequestHandler, OpenAIRequestHandlerFactory, + OpenAIWebSocketBackend, + OpenAIWebSocketBackendArgs, TextCompletionsRequestHandler, ) @@ -37,10 +37,10 @@ "BackendType", "ChatCompletionsRequestHandler", "OpenAIHTTPBackend", - "OpenAIWebSocketBackend", - "OpenAIWebSocketBackendArgs", "OpenAIRequestHandler", "OpenAIRequestHandlerFactory", + "OpenAIWebSocketBackend", + "OpenAIWebSocketBackendArgs", "TextCompletionsRequestHandler", ] diff --git a/src/guidellm/backends/openai/__init__.py b/src/guidellm/backends/openai/__init__.py index fe781e2bc..30c2211e9 100644 --- a/src/guidellm/backends/openai/__init__.py +++ b/src/guidellm/backends/openai/__init__.py @@ -1,5 +1,4 @@ from .http import OpenAIHTTPBackend -from .websocket import OpenAIWebSocketBackend, OpenAIWebSocketBackendArgs from .request_handlers import ( AudioRequestHandler, ChatCompletionsRequestHandler, @@ -8,15 +7,16 @@ ResponsesRequestHandler, TextCompletionsRequestHandler, ) +from .websocket import OpenAIWebSocketBackend, OpenAIWebSocketBackendArgs __all__ = [ "AudioRequestHandler", "ChatCompletionsRequestHandler", "OpenAIHTTPBackend", - "OpenAIWebSocketBackend", - "OpenAIWebSocketBackendArgs", "OpenAIRequestHandler", "OpenAIRequestHandlerFactory", + "OpenAIWebSocketBackend", + "OpenAIWebSocketBackendArgs", "ResponsesRequestHandler", "TextCompletionsRequestHandler", ] diff --git a/src/guidellm/backends/openai/http.py b/src/guidellm/backends/openai/http.py index a96d0f19b..67cd19c6c 100644 --- a/src/guidellm/backends/openai/http.py +++ b/src/guidellm/backends/openai/http.py @@ -233,7 +233,9 @@ def __init__( self.follow_redirects = follow_redirects self.verify = verify 
self.validate_backend: dict[str, Any] | None = resolve_validate_kwargs( - validate_backend + validate_backend, + self.target, + self.api_routes, ) self.stream: bool = stream self.extras = ( @@ -504,4 +506,3 @@ def _build_headers( self, existing_headers: dict[str, str] | None = None ) -> dict[str, str] | None: return build_headers(self.api_key, existing_headers) - diff --git a/src/guidellm/backends/openai/websocket.py b/src/guidellm/backends/openai/websocket.py index dc0d0f224..3bdf8bc25 100644 --- a/src/guidellm/backends/openai/websocket.py +++ b/src/guidellm/backends/openai/websocket.py @@ -272,7 +272,9 @@ def __init__( self.timeout_connect = timeout_connect self.api_routes = _WS_API_ROUTES self.validate_backend: dict[str, Any] | None = resolve_validate_kwargs( - validate_backend + validate_backend, + self.target, + self.api_routes, ) self.extras = extras or {} self._in_process = False @@ -314,7 +316,6 @@ def _ssl_context(self) -> ssl.SSLContext | None: ctx.verify_mode = ssl.CERT_NONE return ctx - def _build_headers( self, existing_headers: dict[str, str] | None = None ) -> dict[str, str] | None: @@ -369,7 +370,9 @@ async def available_models(self) -> list[str]: if self._async_client is None: raise RuntimeError("Backend not started up for process.") target = f"{self.target}/v1/models" - response = await self._async_client.get(target, headers=build_headers(self.api_key)) + response = await self._async_client.get( + target, headers=build_headers(self.api_key) + ) response.raise_for_status() try: payload: Any = response.json() diff --git a/uv.lock b/uv.lock index bc018d7fd..401f76da7 100644 --- a/uv.lock +++ b/uv.lock @@ -988,7 +988,7 @@ requires-dist = [ { name = "types-toml", marker = "extra == 'dev'" }, { name = "uvloop", specifier = ">=0.18" }, { name = "uvloop", marker = "extra == 'perf'" }, - { name = "websockets", marker = "extra == 'audio'", specifier = ">=13.0,<16.0" }, + { name = "websockets", marker = "extra == 'audio'", specifier = ">=13.0" }, ] provides-extras = ["all", "recommended", "perf", "tokenizers", "audio", "vision", "dev"] From bc76810ce1cb5ac520103f81e4be847259b54142 Mon Sep 17 00:00:00 2001 From: Uri Shaket Date: Tue, 5 May 2026 12:54:47 +0300 Subject: [PATCH 09/12] CR Signed-off-by: Uri Shaket --- src/guidellm/backends/backend.py | 6 ++++- src/guidellm/backends/openai/common.py | 11 ++++++--- src/guidellm/backends/openai/websocket.py | 16 +++++++------ tests/unit/backends/test_backend.py | 24 ++++++++++--------- .../schemas/generative/test_entrypoints.py | 21 +++++++++++++--- 5 files changed, 53 insertions(+), 25 deletions(-) diff --git a/src/guidellm/backends/backend.py b/src/guidellm/backends/backend.py index 690b0cd44..4f83afd4c 100644 --- a/src/guidellm/backends/backend.py +++ b/src/guidellm/backends/backend.py @@ -24,7 +24,11 @@ ] -BackendType = Literal["openai_http", "openai_realtime_ws", "vllm_python"] +BackendType = Literal[ + "openai_http", + "openai_websocket", + "vllm_python", +] class BackendArgs(BaseModel): diff --git a/src/guidellm/backends/openai/common.py b/src/guidellm/backends/openai/common.py index 73e489e2b..18e422caf 100644 --- a/src/guidellm/backends/openai/common.py +++ b/src/guidellm/backends/openai/common.py @@ -42,10 +42,15 @@ def resolve_validate_kwargs( api_routes: dict[str, str], ) -> dict[str, Any] | None: """ - Normalize ``validate_backend`` into kwargs for ``httpx`` request(). + Build ``httpx`` request keyword arguments from backend validation settings. 
- Accepts the same shapes as - :class:`~guidellm.backends.openai.http.OpenAIHTTPBackend`. + ``validate_backend`` may be ``False``/equivalent (skip validation), ``True`` + (default ``GET`` against the ``/health`` route key), a route key present in + ``api_routes`` (resolved to ``{target}/{path}``), a full URL string, or a + ``dict`` that includes ``url`` and optionally ``method`` (default ``GET``). + + :return: Keyword arguments suitable for ``httpx.AsyncClient.request``, or + ``None`` when validation is turned off. """ raw = validate_backend if not raw: diff --git a/src/guidellm/backends/openai/websocket.py b/src/guidellm/backends/openai/websocket.py index 3bdf8bc25..55b9df7aa 100644 --- a/src/guidellm/backends/openai/websocket.py +++ b/src/guidellm/backends/openai/websocket.py @@ -66,7 +66,7 @@ def _require_ws_connect() -> Any: from websockets.asyncio.client import connect as ws_connect except ImportError as exc: raise ImportError( - "The openai_realtime_ws backend requires the 'websockets' package. " + "The openai_websocket backend requires the 'websockets' package. " + _AUDIO_EXTRA_HINT ) from exc return ws_connect @@ -128,7 +128,9 @@ def _load_ws_event(raw: str) -> dict[str, Any]: return parsed -# Lazy import cache (no ``global``); tests may set ``pcm16_append_b64_chunks`` directly. +# Module-level hook for ``guidellm.extras.audio.pcm16_append_b64_chunks``: on first +# realtime encode we assign the imported callable here (see ``_ensure_*``). Unit tests +# patch this attribute so WS logic can be exercised without ``guidellm[audio]``. pcm16_append_b64_chunks: Any = None _pcm_imported_fn: dict[str, Any] = {"fn": None} @@ -142,7 +144,7 @@ def _ensure_pcm16_append_b64_chunks() -> Any: from guidellm.extras.audio import pcm16_append_b64_chunks as fn except ImportError as exc: raise ImportError( - "The openai_realtime_ws backend requires the audio extras for PCM " + "The openai_websocket backend requires the audio extras for PCM " "handling used in realtime transcription. " + _AUDIO_EXTRA_HINT ) from exc _pcm_imported_fn["fn"] = fn @@ -240,7 +242,7 @@ class OpenAIWebSocketBackendArgs(BackendArgs): ) -@Backend.register("openai_realtime_ws") +@Backend.register("openai_websocket") class OpenAIWebSocketBackend(Backend): """WebSocket client for realtime (streaming) audio transcription.""" @@ -261,7 +263,7 @@ def __init__( validate_backend: bool | str | dict[str, Any] = True, extras: dict[str, Any] | None = None, ): - super().__init__(type_="openai_realtime_ws") + super().__init__(type_="openai_websocket") self.target = target.rstrip("/").removesuffix("/v1") self.model = model or "" self.websocket_path = websocket_path @@ -402,7 +404,7 @@ async def resolve( # type: ignore[override, misc] # noqa: C901, PLR0912, PLR09 raise RuntimeError("Backend not started up for process.") if history: raise NotImplementedError( - "openai_realtime_ws does not support multiturn/history yet." + "openai_websocket does not support multiturn/history yet." ) audio_columns = request.columns.get("audio_column", []) @@ -415,7 +417,7 @@ async def resolve( # type: ignore[override, misc] # noqa: C901, PLR0912, PLR09 model_name = await self.default_model() if not str(model_name).strip(): raise RuntimeError( - "No model configured for openai_realtime_ws and /v1/models returned " + "No model configured for openai_websocket and /v1/models returned " "none. Pass --model or ensure the server lists at least one model." 
) diff --git a/tests/unit/backends/test_backend.py b/tests/unit/backends/test_backend.py index d4d6744e5..750e74f3f 100644 --- a/tests/unit/backends/test_backend.py +++ b/tests/unit/backends/test_backend.py @@ -318,22 +318,24 @@ async def default_model(self) -> str: assert backend.type_ == "mock_backend" @pytest.mark.smoke - def test_openai_realtime_ws_backend_registered(self): - """Realtime WebSocket backend is registered and constructible. - - ## WRITTEN BY AI ## - """ + def test_openai_websocket_backend_registered(self): + """WebSocket OpenAI backend is registered; legacy alias remains accepted.""" from guidellm.backends.openai import ( OpenAIWebSocketBackend, OpenAIWebSocketBackendArgs, ) - assert Backend.is_registered("openai_realtime_ws") - realtime_args = Backend.get_backend_args("openai_realtime_ws") - assert realtime_args is OpenAIWebSocketBackendArgs - backend = Backend.create("openai_realtime_ws", target="http://localhost:9000") - assert isinstance(backend, OpenAIWebSocketBackend) - assert backend.type_ == "openai_realtime_ws" + assert Backend.is_registered("openai_websocket") + ws_args = OpenAIWebSocketBackendArgs + assert Backend.get_backend_args("openai_websocket") is ws_args + + by_canonical = Backend.create("openai_websocket", target="http://localhost:9000") + assert isinstance(by_canonical, OpenAIWebSocketBackend) + assert by_canonical.type_ == "openai_websocket" + + by_alias = Backend.create("openai_websocket", target="http://localhost:9000") + assert isinstance(by_alias, OpenAIWebSocketBackend) + assert by_alias.type_ == "openai_websocket" def test_openai_backend_registered(self): """Test that OpenAI HTTP backend is registered.""" diff --git a/tests/unit/benchmark/schemas/generative/test_entrypoints.py b/tests/unit/benchmark/schemas/generative/test_entrypoints.py index 099d0325f..fbf1bdcc1 100644 --- a/tests/unit/benchmark/schemas/generative/test_entrypoints.py +++ b/tests/unit/benchmark/schemas/generative/test_entrypoints.py @@ -53,14 +53,14 @@ def test_dict_backend_kwargs_transformed(self): assert args.backend_kwargs.target == "http://localhost:9000" assert args.backend_kwargs.model == "test_model" - def test_openai_realtime_ws_backend_kwargs_validates(self) -> None: + def test_openai_websocket_backend_kwargs_validates(self) -> None: """Realtime WS backend is selected explicitly; no request_format shim. 
## WRITTEN BY AI ## """ args = BenchmarkGenerativeTextArgs.model_validate( { - "backend": "openai_realtime_ws", + "backend": "openai_websocket", "backend_kwargs": { "target": "http://localhost:8000", "model": "rt-model", @@ -68,11 +68,26 @@ def test_openai_realtime_ws_backend_kwargs_validates(self) -> None: "data": ["prompt_tokens=256,output_tokens=128"], } ) - assert args.backend == "openai_realtime_ws" + assert args.backend == "openai_websocket" assert isinstance(args.backend_kwargs, OpenAIWebSocketBackendArgs) assert args.backend_kwargs.target == "http://localhost:8000" assert args.backend_kwargs.model == "rt-model" + def test_openai_websocket_backend_kwargs_validates(self) -> None: + """``openai_websocket`` validates the same way as the legacy registry alias.""" + args = BenchmarkGenerativeTextArgs.model_validate( + { + "backend": "openai_websocket", + "backend_kwargs": { + "target": "http://localhost:8000", + "model": "rt-model", + }, + "data": ["prompt_tokens=256,output_tokens=128"], + } + ) + assert args.backend == "openai_websocket" + assert isinstance(args.backend_kwargs, OpenAIWebSocketBackendArgs) + def test_dict_with_request_format(self): """ Test that request_format is included in BackendArgs transformation. From d60797e72169e7d8568aa1779b5aa55d649942fe Mon Sep 17 00:00:00 2001 From: Uri Shaket Date: Tue, 5 May 2026 12:56:48 +0300 Subject: [PATCH 10/12] remove redundant test Signed-off-by: Uri Shaket --- .../schemas/generative/test_entrypoints.py | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/tests/unit/benchmark/schemas/generative/test_entrypoints.py b/tests/unit/benchmark/schemas/generative/test_entrypoints.py index fbf1bdcc1..8653e065e 100644 --- a/tests/unit/benchmark/schemas/generative/test_entrypoints.py +++ b/tests/unit/benchmark/schemas/generative/test_entrypoints.py @@ -73,21 +73,6 @@ def test_openai_websocket_backend_kwargs_validates(self) -> None: assert args.backend_kwargs.target == "http://localhost:8000" assert args.backend_kwargs.model == "rt-model" - def test_openai_websocket_backend_kwargs_validates(self) -> None: - """``openai_websocket`` validates the same way as the legacy registry alias.""" - args = BenchmarkGenerativeTextArgs.model_validate( - { - "backend": "openai_websocket", - "backend_kwargs": { - "target": "http://localhost:8000", - "model": "rt-model", - }, - "data": ["prompt_tokens=256,output_tokens=128"], - } - ) - assert args.backend == "openai_websocket" - assert isinstance(args.backend_kwargs, OpenAIWebSocketBackendArgs) - def test_dict_with_request_format(self): """ Test that request_format is included in BackendArgs transformation. 
From 62755f8c0898da163cfa0bd4f5705865a06a466a Mon Sep 17 00:00:00 2001 From: Uri Shaket Date: Tue, 5 May 2026 13:02:03 +0300 Subject: [PATCH 11/12] lint Signed-off-by: Uri Shaket --- tests/unit/backends/test_backend.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/unit/backends/test_backend.py b/tests/unit/backends/test_backend.py index 750e74f3f..30fe8f1aa 100644 --- a/tests/unit/backends/test_backend.py +++ b/tests/unit/backends/test_backend.py @@ -329,7 +329,9 @@ def test_openai_websocket_backend_registered(self): ws_args = OpenAIWebSocketBackendArgs assert Backend.get_backend_args("openai_websocket") is ws_args - by_canonical = Backend.create("openai_websocket", target="http://localhost:9000") + by_canonical = Backend.create( + "openai_websocket", target="http://localhost:9000" + ) assert isinstance(by_canonical, OpenAIWebSocketBackend) assert by_canonical.type_ == "openai_websocket" From 53303249a0e930e32836034a643ab4ff2ec36e93 Mon Sep 17 00:00:00 2001 From: Uri Shaket Date: Tue, 5 May 2026 13:23:55 +0300 Subject: [PATCH 12/12] CR Signed-off-by: Uri Shaket --- src/guidellm/__main__.py | 3 +- src/guidellm/backends/openai/websocket.py | 55 +++++++++++++++++-- .../unit/backends/openai/test_realtime_ws.py | 31 ++++++++++- tests/unit/backends/test_backend.py | 14 ++--- .../schemas/generative/test_entrypoints.py | 19 +++++-- 5 files changed, 99 insertions(+), 23 deletions(-) diff --git a/src/guidellm/__main__.py b/src/guidellm/__main__.py index 02ad23bb8..258fa844f 100644 --- a/src/guidellm/__main__.py +++ b/src/guidellm/__main__.py @@ -166,7 +166,8 @@ def benchmark(): "template per vLLM docs. Default: default-template" "For openai backend: http endpoint path (/v1/chat/completions, " "/v1/completions, /v1/audio/transcriptions, /v1/audio/translations) or " - "alias (e.g. chat_completions); default /v1/chat/completions." + "alias (e.g. chat_completions); default /v1/chat/completions. " + "For openai_websocket: WebSocket URL path such as /v1/realtime (default)." ), ) @click.option( diff --git a/src/guidellm/backends/openai/websocket.py b/src/guidellm/backends/openai/websocket.py index 55b9df7aa..78dfc566d 100644 --- a/src/guidellm/backends/openai/websocket.py +++ b/src/guidellm/backends/openai/websocket.py @@ -19,7 +19,7 @@ from urllib.parse import ParseResult, urlparse import httpx -from pydantic import Field +from pydantic import Field, field_validator if TYPE_CHECKING: from websockets.asyncio.client import ClientConnection @@ -48,6 +48,30 @@ "/v1/models": "v1/models", } +# Default WebSocket HTTP path under target (CLI: --request-format / --request-type). +_DEFAULT_WS_REQUEST_FORMAT = "/v1/realtime" +_WS_REQUEST_FORMAT_ALIASES: dict[str, str] = { + "realtime": _DEFAULT_WS_REQUEST_FORMAT, +} + + +def _effective_websocket_http_path(request_format: str | None) -> str: + """Normalize ``request_format`` to a WebSocket path (``/…`` segment on the host).""" + if request_format is None: + return _DEFAULT_WS_REQUEST_FORMAT + s = request_format.strip() + if not s: + raise ValueError("request_format must not be empty or whitespace") + canonical = _WS_REQUEST_FORMAT_ALIASES.get(s, s) + if not canonical.startswith("/"): + raise ValueError( + "request_format must be a path starting with '/' (for example " + f"{_DEFAULT_WS_REQUEST_FORMAT!r}) or alias " + f"{', '.join(repr(k) for k in _WS_REQUEST_FORMAT_ALIASES)}" + ) + return canonical + + # Guard against a misbehaving server that only emits ignored event types. 
_MAX_IGNORED_WS_EVENT_TYPES = 50_000 @@ -207,10 +231,29 @@ class OpenAIWebSocketBackendArgs(BackendArgs): default=None, description="Model identifier (required unless discoverable from /v1/models).", ) - websocket_path: str = Field( - default="/v1/realtime", - description="WebSocket path on the server (default /v1/realtime).", + request_format: str | None = Field( + default=None, + description=( + "WebSocket path on the HTTP host (default /v1/realtime). " + "Use the same top-level CLI flags as ``openai_http``: " + "--request-format / --request-type." + ), + json_schema_extra={ + "error_message": ( + "Backend '{backend_type}' received an invalid --request-format / " + f"request_format. Use {_DEFAULT_WS_REQUEST_FORMAT!r} or another " + "path starting with '/'." + ) + }, ) + + @field_validator("request_format") + @classmethod + def validate_request_format(cls, v: str | None) -> str | None: + if v is None: + return None + return _effective_websocket_http_path(v) + chunk_samples: int = Field( default=3200, ge=1, @@ -254,7 +297,7 @@ def __init__( self, target: str, model: str = "", - websocket_path: str = "/v1/realtime", + request_format: str | None = None, chunk_samples: int = 3200, api_key: str | None = None, verify: bool = False, @@ -266,7 +309,7 @@ def __init__( super().__init__(type_="openai_websocket") self.target = target.rstrip("/").removesuffix("/v1") self.model = model or "" - self.websocket_path = websocket_path + self.websocket_path = _effective_websocket_http_path(request_format) self.chunk_samples = chunk_samples self.api_key = api_key self.verify = verify diff --git a/tests/unit/backends/openai/test_realtime_ws.py b/tests/unit/backends/openai/test_realtime_ws.py index 8c7d7c597..7f0fd497b 100644 --- a/tests/unit/backends/openai/test_realtime_ws.py +++ b/tests/unit/backends/openai/test_realtime_ws.py @@ -10,6 +10,7 @@ import json import pytest +from pydantic import ValidationError try: from websockets.asyncio.server import serve @@ -20,6 +21,7 @@ allow_module_level=True, ) +from guidellm.backends.backend import Backend from guidellm.backends.openai.websocket import ( _DEFAULT_WS_RECV_TIMEOUT, OpenAIWebSocketBackend, @@ -704,8 +706,33 @@ async def handler(ws: object) -> None: await be.process_shutdown() -def test_openai_realtime_backend_args_model() -> None: +def test_openai_websocket_backend_args_model() -> None: a = OpenAIWebSocketBackendArgs(target="http://localhost:8000", model="x") - assert a.websocket_path == "/v1/realtime" + assert a.request_format is None assert a.chunk_samples == 3200 assert a.timeout == _DEFAULT_WS_RECV_TIMEOUT + + +def test_openai_websocket_backend_args_normalizes_request_format_alias() -> None: + args = OpenAIWebSocketBackendArgs( + target="http://localhost:8000", + request_format="realtime", + ) + assert args.request_format == "/v1/realtime" + + +def test_openai_websocket_backend_resolves_websocket_path_from_request_format() -> None: + backend = Backend.create( + "openai_websocket", + target="http://127.0.0.1:9", + request_format="/custom/ws", + ) + assert backend.websocket_path == "/custom/ws" + + +def test_openai_websocket_backend_args_invalid_request_format_rejected() -> None: + with pytest.raises(ValidationError): + OpenAIWebSocketBackendArgs( + target="http://localhost:8000", + request_format="nope", + ) diff --git a/tests/unit/backends/test_backend.py b/tests/unit/backends/test_backend.py index 30fe8f1aa..271117c3d 100644 --- a/tests/unit/backends/test_backend.py +++ b/tests/unit/backends/test_backend.py @@ -319,7 +319,7 @@ async def 
default_model(self) -> str: @pytest.mark.smoke def test_openai_websocket_backend_registered(self): - """WebSocket OpenAI backend is registered; legacy alias remains accepted.""" + """WebSocket OpenAI backend is registered and constructible.""" from guidellm.backends.openai import ( OpenAIWebSocketBackend, OpenAIWebSocketBackendArgs, @@ -329,15 +329,9 @@ def test_openai_websocket_backend_registered(self): ws_args = OpenAIWebSocketBackendArgs assert Backend.get_backend_args("openai_websocket") is ws_args - by_canonical = Backend.create( - "openai_websocket", target="http://localhost:9000" - ) - assert isinstance(by_canonical, OpenAIWebSocketBackend) - assert by_canonical.type_ == "openai_websocket" - - by_alias = Backend.create("openai_websocket", target="http://localhost:9000") - assert isinstance(by_alias, OpenAIWebSocketBackend) - assert by_alias.type_ == "openai_websocket" + backend = Backend.create("openai_websocket", target="http://localhost:9000") + assert isinstance(backend, OpenAIWebSocketBackend) + assert backend.type_ == "openai_websocket" def test_openai_backend_registered(self): """Test that OpenAI HTTP backend is registered.""" diff --git a/tests/unit/benchmark/schemas/generative/test_entrypoints.py b/tests/unit/benchmark/schemas/generative/test_entrypoints.py index 8653e065e..ad5f8618c 100644 --- a/tests/unit/benchmark/schemas/generative/test_entrypoints.py +++ b/tests/unit/benchmark/schemas/generative/test_entrypoints.py @@ -54,10 +54,7 @@ def test_dict_backend_kwargs_transformed(self): assert args.backend_kwargs.model == "test_model" def test_openai_websocket_backend_kwargs_validates(self) -> None: - """Realtime WS backend is selected explicitly; no request_format shim. - - ## WRITTEN BY AI ## - """ + """WebSocket backend accepts ``request_format`` (CLI --request-format).""" args = BenchmarkGenerativeTextArgs.model_validate( { "backend": "openai_websocket", @@ -72,6 +69,20 @@ def test_openai_websocket_backend_kwargs_validates(self) -> None: assert isinstance(args.backend_kwargs, OpenAIWebSocketBackendArgs) assert args.backend_kwargs.target == "http://localhost:8000" assert args.backend_kwargs.model == "rt-model" + assert args.backend_kwargs.request_format is None + + with_format = BenchmarkGenerativeTextArgs.model_validate( + { + "backend": "openai_websocket", + "backend_kwargs": { + "target": "http://localhost:8000", + "model": "rt-model", + "request_format": "realtime", + }, + "data": ["prompt_tokens=256,output_tokens=128"], + } + ) + assert with_format.backend_kwargs.request_format == "/v1/realtime" def test_dict_with_request_format(self): """
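
A note on the wire protocol, since it is spread across several hunks above: the realtime exchange is a fixed choreography of JSON text frames. The following is a minimal standalone client sketch of that flow, assuming a vLLM-style server at ws://localhost:8000/v1/realtime and pre-encoded base64 PCM16 chunks; the timeout, error-event, and metrics handling of the real backend is omitted.

import json

from websockets.asyncio.client import connect


async def transcribe_once(b64_chunks: list[str], model: str) -> str:
    # Connect to the assumed server address; the backend derives this URL
    # from --target plus the websocket path.
    async with connect("ws://localhost:8000/v1/realtime") as ws:
        # The server speaks first; anything but session.created is a protocol error.
        first = json.loads(await ws.recv())
        assert first["type"] == "session.created"
        await ws.send(json.dumps({"type": "session.update", "model": model}))
        for chunk in b64_chunks:
            await ws.send(
                json.dumps({"type": "input_audio_buffer.append", "audio": chunk})
            )
        # final=false starts transcription; final=true is the end-of-stream sentinel.
        await ws.send(json.dumps({"type": "input_audio_buffer.commit", "final": False}))
        await ws.send(json.dumps({"type": "input_audio_buffer.commit", "final": True}))
        parts: list[str] = []
        while True:
            event = json.loads(await ws.recv())
            if event["type"] == "transcription.delta":
                parts.append(event.get("delta") or "")
            elif event["type"] == "transcription.done":
                return event.get("text") or "".join(parts)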
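
The chunk_samples default of 3200 is easier to audit with the arithmetic written out: at 16 kHz mono PCM16 that is 3200 / 16000 = 200 ms of audio and 3200 * 2 = 6400 raw bytes per input_audio_buffer.append payload, before base64 expansion. Below is a sketch of the chunking contract behind pcm16_append_b64_chunks, assuming numpy and a float waveform in [-1.0, 1.0]; the real helper lives in guidellm.extras.audio and may differ in detail.

import base64

import numpy as np


def chunk_pcm16_b64(waveform: np.ndarray, chunk_samples: int = 3200) -> list[str]:
    # Clip to [-1, 1], scale to the symmetric int16 peak, serialize little-endian.
    pcm = (np.clip(waveform, -1.0, 1.0) * 32767).astype("<i2").tobytes()
    step = chunk_samples * 2  # two bytes per PCM16 sample
    return [
        base64.b64encode(pcm[i : i + step]).decode("ascii")
        for i in range(0, len(pcm), step)
    ]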
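
The validate_backend shapes that resolve_validate_kwargs accepts collapse to httpx request kwargs as follows; the target and URLs here are illustrative.

from guidellm.backends.openai.common import resolve_validate_kwargs

routes = {"/health": "health"}
target = "http://localhost:8000"

# Falsy values disable validation entirely.
assert resolve_validate_kwargs(False, target, routes) is None
# True defaults to a GET against the /health route key under target.
assert resolve_validate_kwargs(True, target, routes) == {
    "method": "GET",
    "url": "http://localhost:8000/health",
}
# A string that is not a route key is treated as a full URL.
assert resolve_validate_kwargs("http://probe/up", target, routes) == {
    "method": "GET",
    "url": "http://probe/up",
}
# Dicts pass through, with method defaulting to GET when absent.
assert resolve_validate_kwargs({"url": "http://probe/up"}, target, routes) == {
    "url": "http://probe/up",
    "method": "GET",
}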
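
Finally, the request_format resolution added in the last patch maps CLI input to a websocket path in three steps: None keeps the default, the "realtime" alias canonicalizes, and anything that is not a /-prefixed path is rejected. The values below are illustrative.

from guidellm.backends.openai.websocket import _effective_websocket_http_path

assert _effective_websocket_http_path(None) == "/v1/realtime"
assert _effective_websocket_http_path("realtime") == "/v1/realtime"
assert _effective_websocket_http_path("/custom/ws") == "/custom/ws"
try:
    _effective_websocket_http_path("nope")
except ValueError:
    pass  # must start with "/" or use a known alias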