From 4e43c8275a92a4b0d657f0435b943d78fecc2e82 Mon Sep 17 00:00:00 2001 From: Hitesh Joshi <217911+hiteshjoshi@users.noreply.github.com> Date: Fri, 18 Apr 2025 13:24:35 +0530 Subject: [PATCH 01/48] remove debugging (#2) * remove debugging From c9cff364d6fbf013afbb62c909684e4dbe402e3d Mon Sep 17 00:00:00 2001 From: janligudzinski Date: Thu, 11 Sep 2025 13:19:00 +0200 Subject: [PATCH 02/48] add newest Realtime voices --- src/realtime/types.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/realtime/types.rs b/src/realtime/types.rs index a90ff27f..04292d93 100644 --- a/src/realtime/types.rs +++ b/src/realtime/types.rs @@ -32,8 +32,10 @@ pub enum RealtimeVoice { Alloy, Ash, Ballad, + Cedar, Coral, Echo, + Marin, Sage, Shimmer, Verse, From 561b198d4df5a8dc2db5a856bf4a8ca6dedd4c86 Mon Sep 17 00:00:00 2001 From: janligudzinski Date: Thu, 11 Sep 2025 13:19:17 +0200 Subject: [PATCH 03/48] add dedicated RealtimeSIP client --- src/realtime/api.rs | 2 ++ src/realtime/api/sip.rs | 43 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+) create mode 100644 src/realtime/api/sip.rs diff --git a/src/realtime/api.rs b/src/realtime/api.rs index 82faeb8b..cf901b39 100644 --- a/src/realtime/api.rs +++ b/src/realtime/api.rs @@ -7,6 +7,8 @@ use tokio_tungstenite::{ MaybeTlsStream, WebSocketStream, }; +pub mod sip; + const WSS_URL: &str = "wss://api.openai.com/v1/realtime"; pub struct RealtimeClient { diff --git a/src/realtime/api/sip.rs b/src/realtime/api/sip.rs new file mode 100644 index 00000000..a5555612 --- /dev/null +++ b/src/realtime/api/sip.rs @@ -0,0 +1,43 @@ +use super::*; + +/// Intended for connecting to an already existing Realtime session spawned by accepting an incoming SIP call from e.g. Twilio. 
+pub struct RealtimeSipClient { + pub wss_url: String, + pub api_key: String, + pub call_id: String, +} + +impl RealtimeSipClient { + pub fn new(api_key: String, call_id: String) -> Self { + let wss_url = std::env::var("WSS_URL").unwrap_or_else(|_| WSS_URL.to_owned()); + Self::new_with_endpoint(wss_url, api_key, call_id) + } + + pub fn new_with_endpoint(wss_url: String, api_key: String, call_id: String) -> Self { + Self { + wss_url, + api_key, + call_id, + } + } + + pub async fn connect( + &self, + ) -> Result< + ( + SplitSink>, Message>, + SplitStream>>, + ), + Box, + > { + let url = format!("{}?callId={}", self.wss_url, self.call_id); + let mut request = url.into_client_request()?; + let api_key = self.api_key.clone(); + request + .headers_mut() + .insert("Authorization", format!("Bearer {api_key}").parse()?); + let (ws_stream, _) = connect_async(request).await?; + let (write, read) = ws_stream.split(); + Ok((write, read)) + } +} From 2ea3ae17e1061f1f010e81c916ced53fedfa1806 Mon Sep 17 00:00:00 2001 From: janligudzinski Date: Thu, 11 Sep 2025 13:53:21 +0200 Subject: [PATCH 04/48] add accept/reject/refer SIP calls to v1 client --- src/v1/api.rs | 27 +++++++++++++++++++++++++++ src/v1/calls.rs | 19 +++++++++++++++++++ src/v1/mod.rs | 1 + 3 files changed, 47 insertions(+) create mode 100644 src/v1/calls.rs diff --git a/src/v1/api.rs b/src/v1/api.rs index ce0b1c4e..a560de0d 100644 --- a/src/v1/api.rs +++ b/src/v1/api.rs @@ -7,6 +7,7 @@ use crate::v1::audio::{ AudioTranslationRequest, AudioTranslationResponse, }; use crate::v1::batch::{BatchResponse, CreateBatchRequest, ListBatchResponse}; +use crate::v1::calls::{AcceptCallRequest, ReferCallRequest}; use crate::v1::chat_completion::{ChatCompletionRequest, ChatCompletionResponse}; use crate::v1::common; use crate::v1::completion::{CompletionRequest, CompletionResponse}; @@ -796,6 +797,32 @@ impl OpenAIClient { self.delete(&format!("models/{model_id}")).await } + pub async fn accept_call( + &mut self, + call_id: &str, + 
accept: AcceptCallRequest, + ) -> Result<(), APIError> { + self.post::<()>(&format!("realtime/calls/{call_id}/accept"), &accept) + .await?; + Ok(()) + } + + pub async fn reject_call(&mut self, call_id: &str) -> Result<(), APIError> { + self.post::<()>(&format!("realtime/calls/{call_id}/reject"), &()) + .await?; + Ok(()) + } + + pub async fn refer_call( + &mut self, + call_id: &str, + refer: ReferCallRequest, + ) -> Result<(), APIError> { + self.post::<()>(&format!("realtime/calls/{call_id}/refer"), &refer) + .await?; + Ok(()) + } + fn build_url_with_preserved_query(&self, path: &str) -> Result { let (base, query_opt) = match self.api_endpoint.split_once('?') { Some((b, q)) => (b.trim_end_matches('/'), Some(q)), diff --git a/src/v1/calls.rs b/src/v1/calls.rs new file mode 100644 index 00000000..dbf1b073 --- /dev/null +++ b/src/v1/calls.rs @@ -0,0 +1,19 @@ +use serde::{Deserialize, Serialize}; + +/// Used to start a realtime session based on an incoming call. +/// Note that this is poorly documented by OpenAI with the only example data given in https://platform.openai.com/docs/guides/realtime-sip#handle-the-webhook and these may not be all the possible fields. +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct AcceptCallRequest { + /// This is *always* `realtime`. + #[serde(rename = "type")] + pub session_type: String, + pub instructions: String, + pub model: String, +} + +/// Used to redirect a call to another number. Per https://platform.openai.com/docs/guides/realtime-sip#handle-the-webhook the Tel-URI scheme may be used. 
+#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct ReferCallRequest { + /// The URI to redirect the call to, for example `tel:+14152909007` + pub target_uri: String, +} diff --git a/src/v1/mod.rs b/src/v1/mod.rs index d44ed319..dbbe00c7 100644 --- a/src/v1/mod.rs +++ b/src/v1/mod.rs @@ -4,6 +4,7 @@ pub mod types; pub mod audio; pub mod batch; +pub mod calls; pub mod chat_completion; pub mod completion; pub mod edit; From 1021ab5f4099b00d5ed2b408746d408fcd0a1eea Mon Sep 17 00:00:00 2001 From: janligudzinski Date: Thu, 11 Sep 2025 13:53:58 +0200 Subject: [PATCH 05/48] rename module to reflect endpoint naming --- src/v1/api.rs | 2 +- src/v1/mod.rs | 2 +- src/v1/{calls.rs => realtime_calls.rs} | 0 3 files changed, 2 insertions(+), 2 deletions(-) rename src/v1/{calls.rs => realtime_calls.rs} (100%) diff --git a/src/v1/api.rs b/src/v1/api.rs index a560de0d..f2ff144e 100644 --- a/src/v1/api.rs +++ b/src/v1/api.rs @@ -7,7 +7,6 @@ use crate::v1::audio::{ AudioTranslationRequest, AudioTranslationResponse, }; use crate::v1::batch::{BatchResponse, CreateBatchRequest, ListBatchResponse}; -use crate::v1::calls::{AcceptCallRequest, ReferCallRequest}; use crate::v1::chat_completion::{ChatCompletionRequest, ChatCompletionResponse}; use crate::v1::common; use crate::v1::completion::{CompletionRequest, CompletionResponse}; @@ -33,6 +32,7 @@ use crate::v1::message::{ }; use crate::v1::model::{ModelResponse, ModelsResponse}; use crate::v1::moderation::{CreateModerationRequest, CreateModerationResponse}; +use crate::v1::realtime_calls::{AcceptCallRequest, ReferCallRequest}; use crate::v1::run::{ CreateRunRequest, CreateThreadAndRunRequest, ListRun, ListRunStep, ModifyRunRequest, RunObject, RunStepObject, diff --git a/src/v1/mod.rs b/src/v1/mod.rs index dbbe00c7..13e3d9c5 100644 --- a/src/v1/mod.rs +++ b/src/v1/mod.rs @@ -4,7 +4,6 @@ pub mod types; pub mod audio; pub mod batch; -pub mod calls; pub mod chat_completion; pub mod completion; pub mod edit; @@ -14,6 +13,7 @@ pub mod 
fine_tuning; pub mod image; pub mod model; pub mod moderation; +pub mod realtime_calls; // beta pub mod assistant; diff --git a/src/v1/calls.rs b/src/v1/realtime_calls.rs similarity index 100% rename from src/v1/calls.rs rename to src/v1/realtime_calls.rs From d6af149f367c5aa0be66f712f8ebe4eaad1f3c92 Mon Sep 17 00:00:00 2001 From: janligudzinski Date: Thu, 11 Sep 2025 13:56:08 +0200 Subject: [PATCH 06/48] add doc comment --- src/v1/realtime_calls.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/v1/realtime_calls.rs b/src/v1/realtime_calls.rs index dbf1b073..c7632462 100644 --- a/src/v1/realtime_calls.rs +++ b/src/v1/realtime_calls.rs @@ -1,6 +1,6 @@ use serde::{Deserialize, Serialize}; -/// Used to start a realtime session based on an incoming call. +/// Used to start a realtime session based on an incoming call that you can then connect to over WSS with `RealtimeSipClient` from `openai_api_rs::realtime::sip`. /// Note that this is poorly documented by OpenAI with the only example data given in https://platform.openai.com/docs/guides/realtime-sip#handle-the-webhook and these may not be all the possible fields. 
#[derive(Debug, Serialize, Deserialize, Clone)] pub struct AcceptCallRequest { From 0e082543e7121d08f8372132020497e7461cd61e Mon Sep 17 00:00:00 2001 From: janligudzinski Date: Thu, 11 Sep 2025 14:01:44 +0200 Subject: [PATCH 07/48] strongly typed realtime model enum --- src/realtime/types.rs | 18 ++++++++++++++++++ src/v1/realtime_calls.rs | 15 +++++++++++++-- 2 files changed, 31 insertions(+), 2 deletions(-) diff --git a/src/realtime/types.rs b/src/realtime/types.rs index 04292d93..65173a01 100644 --- a/src/realtime/types.rs +++ b/src/realtime/types.rs @@ -26,6 +26,24 @@ pub struct Session { pub max_output_tokens: Option, } +#[derive(Debug, Serialize, Deserialize, Clone)] +pub enum RealtimeModel { + #[serde(rename = "gpt-realtime")] + GptRealtime, + #[serde(rename = "gpt-4o-realtime-preview")] + Gpt4oRealtimePreview, + #[serde(rename = "gpt-4o-mini-realtime-preview")] + Gpt4oMiniRealtimePreview, + #[serde(rename = "gpt-realtime-2025-08-28")] + GptRealtime20250828, + #[serde(rename = "gpt-4o-realtime-preview-2024-12-17")] + Gpt4oRealtimePreview20241217, + #[serde(rename = "gpt-4o-realtime-preview-2024-10-01")] + Gpt4oRealtimePreview20241001, + #[serde(rename = "gpt-4o-mini-realtime-preview-2024-12-17")] + Gpt4oMiniRealtimePreview20241217, +} + #[derive(Debug, Serialize, Deserialize, Clone)] #[serde(rename_all = "lowercase")] pub enum RealtimeVoice { diff --git a/src/v1/realtime_calls.rs b/src/v1/realtime_calls.rs index c7632462..de0705cc 100644 --- a/src/v1/realtime_calls.rs +++ b/src/v1/realtime_calls.rs @@ -1,14 +1,25 @@ use serde::{Deserialize, Serialize}; +use crate::realtime::types::RealtimeModel; + /// Used to start a realtime session based on an incoming call that you can then connect to over WSS with `RealtimeSipClient` from `openai_api_rs::realtime::sip`. /// Note that this is poorly documented by OpenAI with the only example data given in https://platform.openai.com/docs/guides/realtime-sip#handle-the-webhook and these may not be all the possible fields. 
#[derive(Debug, Serialize, Deserialize, Clone)] pub struct AcceptCallRequest { - /// This is *always* `realtime`. + /// This is *always* `realtime`. Convenience constructor exposed to ensure this. #[serde(rename = "type")] pub session_type: String, pub instructions: String, - pub model: String, + pub model: RealtimeModel, +} +impl AcceptCallRequest { + pub fn new(instructions: String, model: RealtimeModel) -> Self { + Self { + session_type: "realtime".to_string(), + instructions, + model, + } + } } /// Used to redirect a call to another number. Per https://platform.openai.com/docs/guides/realtime-sip#handle-the-webhook the Tel-URI scheme may be used. From 5ce9e2630a9d9ae0264619156729afc6ab8a66e8 Mon Sep 17 00:00:00 2001 From: janligudzinski Date: Thu, 11 Sep 2025 14:09:10 +0200 Subject: [PATCH 08/48] use one-off unit struct for "type": "realtime" field --- src/realtime/types.rs | 5 +++++ src/v1/realtime_calls.rs | 6 +++--- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/src/realtime/types.rs b/src/realtime/types.rs index 65173a01..0c08a857 100644 --- a/src/realtime/types.rs +++ b/src/realtime/types.rs @@ -44,6 +44,11 @@ pub enum RealtimeModel { Gpt4oMiniRealtimePreview20241217, } +/// Unit struct representing the only possible value for `type` in the accept call payload. 
+#[derive(Debug, Serialize, Deserialize, Clone)] +#[serde(rename = "realtime")] +pub struct RealtimeSessionType; + #[derive(Debug, Serialize, Deserialize, Clone)] #[serde(rename_all = "lowercase")] pub enum RealtimeVoice { diff --git a/src/v1/realtime_calls.rs b/src/v1/realtime_calls.rs index de0705cc..253387f0 100644 --- a/src/v1/realtime_calls.rs +++ b/src/v1/realtime_calls.rs @@ -1,6 +1,6 @@ use serde::{Deserialize, Serialize}; -use crate::realtime::types::RealtimeModel; +use crate::realtime::types::{RealtimeModel, RealtimeSessionType}; /// Used to start a realtime session based on an incoming call that you can then connect to over WSS with `RealtimeSipClient` from `openai_api_rs::realtime::sip`. /// Note that this is poorly documented by OpenAI with the only example data given in https://platform.openai.com/docs/guides/realtime-sip#handle-the-webhook and these may not be all the possible fields. @@ -8,14 +8,14 @@ use crate::realtime::types::RealtimeModel; pub struct AcceptCallRequest { /// This is *always* `realtime`. Convenience constructor exposed to ensure this. 
#[serde(rename = "type")] - pub session_type: String, + pub session_type: RealtimeSessionType, pub instructions: String, pub model: RealtimeModel, } impl AcceptCallRequest { pub fn new(instructions: String, model: RealtimeModel) -> Self { Self { - session_type: "realtime".to_string(), + session_type: RealtimeSessionType, instructions, model, } From 01484baf67b14ebe7d3e040e3db23a60d701c24b Mon Sep 17 00:00:00 2001 From: janligudzinski Date: Thu, 11 Sep 2025 14:44:07 +0200 Subject: [PATCH 09/48] use string response body type, add hangup endpoint, add notes on reject call endpoint (tldr: useless) --- src/v1/api.rs | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/src/v1/api.rs b/src/v1/api.rs index f2ff144e..b6d57954 100644 --- a/src/v1/api.rs +++ b/src/v1/api.rs @@ -802,13 +802,30 @@ impl OpenAIClient { call_id: &str, accept: AcceptCallRequest, ) -> Result<(), APIError> { - self.post::<()>(&format!("realtime/calls/{call_id}/accept"), &accept) + // /realtime/calls endpoints return empty strings on success + self.post::(&format!("realtime/calls/{call_id}/accept"), &accept) .await?; Ok(()) } + pub async fn hangup_call(&mut self, call_id: &str) -> Result<(), APIError> { + // /realtime/calls endpoints return empty strings on success + self.post::(&format!("realtime/calls/{call_id}/hangup"), &()) + .await?; + Ok(()) + } + + /// Note that `reject_call` is very poorly documented and seems to be non-functional even in the GA release as of 2025-09-11: + /// + /// - it returns a 404 if there is no session associated with the call (ie. it hasn't been `accept`ed yet) + /// - it returns a 500 if there *is* one + /// - in neither case does the call actually end + /// + /// Per https://community.openai.com/t/how-can-i-programatically-end-a-gpt-realtime-sip-call/1355362 a `hangup` method exists, not documented elsewhere; + /// a sensible workaround is to `accept` the call and immediately `hangup`. See `hangup_call`. 
pub async fn reject_call(&mut self, call_id: &str) -> Result<(), APIError> { - self.post::<()>(&format!("realtime/calls/{call_id}/reject"), &()) + // ditto WRT successful body + self.post::(&format!("realtime/calls/{call_id}/reject"), &()) .await?; Ok(()) } @@ -818,7 +835,8 @@ impl OpenAIClient { call_id: &str, refer: ReferCallRequest, ) -> Result<(), APIError> { - self.post::<()>(&format!("realtime/calls/{call_id}/refer"), &refer) + // ditto WRT successful body + self.post::(&format!("realtime/calls/{call_id}/refer"), &refer) .await?; Ok(()) } From 8d3789365cae04866a2608dd9345a849b60faf18 Mon Sep 17 00:00:00 2001 From: janligudzinski Date: Thu, 11 Sep 2025 14:50:59 +0200 Subject: [PATCH 10/48] make OpenAIClient cloneable --- src/v1/api.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/v1/api.rs b/src/v1/api.rs index b6d57954..6b41f5a2 100644 --- a/src/v1/api.rs +++ b/src/v1/api.rs @@ -65,7 +65,7 @@ pub struct OpenAIClientBuilder { headers: Option, } -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct OpenAIClient { api_endpoint: String, api_key: Option, From 73c50ef28dfc5c3f0b036e967a920e21b04ab984 Mon Sep 17 00:00:00 2001 From: janligudzinski Date: Thu, 11 Sep 2025 14:58:47 +0200 Subject: [PATCH 11/48] accept anything Into for accept call constructor --- src/v1/realtime_calls.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/v1/realtime_calls.rs b/src/v1/realtime_calls.rs index 253387f0..303fed81 100644 --- a/src/v1/realtime_calls.rs +++ b/src/v1/realtime_calls.rs @@ -13,10 +13,10 @@ pub struct AcceptCallRequest { pub model: RealtimeModel, } impl AcceptCallRequest { - pub fn new(instructions: String, model: RealtimeModel) -> Self { + pub fn new(instructions: impl Into, model: RealtimeModel) -> Self { Self { session_type: RealtimeSessionType, - instructions, + instructions: instructions.into(), model, } } From 992401868dee92c09aa4699354288000f5effac7 Mon Sep 17 00:00:00 2001 From: janligudzinski Date: Thu, 11 Sep 
2025 15:08:18 +0200 Subject: [PATCH 12/48] fix: add new helper method that ignores the response body, use it for realtime/calls methods --- src/v1/api.rs | 50 ++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 36 insertions(+), 14 deletions(-) diff --git a/src/v1/api.rs b/src/v1/api.rs index 6b41f5a2..1defa82d 100644 --- a/src/v1/api.rs +++ b/src/v1/api.rs @@ -195,6 +195,32 @@ impl OpenAIClient { self.handle_response(response).await } + /// `POST`s but expects an empty response rather than anything to deserialize. + async fn post_empty( + &mut self, + path: &str, + body: &impl serde::ser::Serialize, + ) -> Result<(), APIError> { + let request = self.build_request(Method::POST, path).await; + let request = request.json(body); + let response = request.send().await?; + + if response.status().is_success() { + let headers = response.headers().clone(); + self.response_headers = Some(headers); + Ok(()) + } else { + let status = response.status(); + let error_message = response + .text() + .await + .unwrap_or_else(|_| format!("Unknown error - no body text was provided")); + Err(APIError::CustomError { + message: format!("{status}: {error_message}"), + }) + } + } + async fn get(&mut self, path: &str) -> Result { let request = self.build_request(Method::GET, path).await; let response = request.send().await?; @@ -802,17 +828,15 @@ impl OpenAIClient { call_id: &str, accept: AcceptCallRequest, ) -> Result<(), APIError> { - // /realtime/calls endpoints return empty strings on success - self.post::(&format!("realtime/calls/{call_id}/accept"), &accept) - .await?; - Ok(()) + // /realtime/calls endpoints return empty responses on success + self.post_empty(&format!("realtime/calls/{call_id}/accept"), &accept) + .await } pub async fn hangup_call(&mut self, call_id: &str) -> Result<(), APIError> { - // /realtime/calls endpoints return empty strings on success - self.post::(&format!("realtime/calls/{call_id}/hangup"), &()) - .await?; - Ok(()) + // /realtime/calls 
endpoints return empty responses on success + self.post_empty(&format!("realtime/calls/{call_id}/hangup"), &()) + .await } /// Note that `reject_call` is very poorly documented and seems to be non-functional even in the GA release as of 2025-09-11: @@ -825,9 +849,8 @@ impl OpenAIClient { /// a sensible workaround is to `accept` the call and immediately `hangup`. See `hangup_call`. pub async fn reject_call(&mut self, call_id: &str) -> Result<(), APIError> { // ditto WRT successful body - self.post::(&format!("realtime/calls/{call_id}/reject"), &()) - .await?; - Ok(()) + self.post_empty(&format!("realtime/calls/{call_id}/reject"), &()) + .await } pub async fn refer_call( @@ -836,9 +859,8 @@ impl OpenAIClient { refer: ReferCallRequest, ) -> Result<(), APIError> { // ditto WRT successful body - self.post::(&format!("realtime/calls/{call_id}/refer"), &refer) - .await?; - Ok(()) + self.post_empty(&format!("realtime/calls/{call_id}/refer"), &refer) + .await } fn build_url_with_preserved_query(&self, path: &str) -> Result { From 48a75156bf178d576b36fd61494b37aa8e34a8e8 Mon Sep 17 00:00:00 2001 From: janligudzinski Date: Thu, 11 Sep 2025 15:42:01 +0200 Subject: [PATCH 13/48] fix: use enum instead of unit struct for `type: "realtime"` due to serialization behavior for unit structs (was written as `null`) --- src/realtime/types.rs | 8 +++++--- src/v1/realtime_calls.rs | 6 +++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/src/realtime/types.rs b/src/realtime/types.rs index 0c08a857..9611e665 100644 --- a/src/realtime/types.rs +++ b/src/realtime/types.rs @@ -44,10 +44,12 @@ pub enum RealtimeModel { Gpt4oMiniRealtimePreview20241217, } -/// Unit struct representing the only possible value for `type` in the accept call payload. +/// Enum representing the only possible value for `type` in the accept call payload. 
#[derive(Debug, Serialize, Deserialize, Clone)] -#[serde(rename = "realtime")] -pub struct RealtimeSessionType; +#[serde(rename_all = "lowercase")] +pub enum AcceptCallSessionType { + Realtime, +} #[derive(Debug, Serialize, Deserialize, Clone)] #[serde(rename_all = "lowercase")] diff --git a/src/v1/realtime_calls.rs b/src/v1/realtime_calls.rs index 303fed81..abc7bf14 100644 --- a/src/v1/realtime_calls.rs +++ b/src/v1/realtime_calls.rs @@ -1,6 +1,6 @@ use serde::{Deserialize, Serialize}; -use crate::realtime::types::{RealtimeModel, RealtimeSessionType}; +use crate::realtime::types::{AcceptCallSessionType, RealtimeModel}; /// Used to start a realtime session based on an incoming call that you can then connect to over WSS with `RealtimeSipClient` from `openai_api_rs::realtime::sip`. /// Note that this is poorly documented by OpenAI with the only example data given in https://platform.openai.com/docs/guides/realtime-sip#handle-the-webhook and these may not be all the possible fields. @@ -8,14 +8,14 @@ use crate::realtime::types::{RealtimeModel, RealtimeSessionType}; pub struct AcceptCallRequest { /// This is *always* `realtime`. Convenience constructor exposed to ensure this. 
#[serde(rename = "type")] - pub session_type: RealtimeSessionType, + pub session_type: AcceptCallSessionType, pub instructions: String, pub model: RealtimeModel, } impl AcceptCallRequest { pub fn new(instructions: impl Into, model: RealtimeModel) -> Self { Self { - session_type: RealtimeSessionType, + session_type: AcceptCallSessionType::Realtime, instructions: instructions.into(), model, } From 7cca2e607589528d7d60603a70cbdd850433f803 Mon Sep 17 00:00:00 2001 From: janligudzinski Date: Thu, 11 Sep 2025 15:54:58 +0200 Subject: [PATCH 14/48] add webhook payload type to library --- src/realtime/api/sip.rs | 138 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 138 insertions(+) diff --git a/src/realtime/api/sip.rs b/src/realtime/api/sip.rs index a5555612..6e09ac57 100644 --- a/src/realtime/api/sip.rs +++ b/src/realtime/api/sip.rs @@ -1,3 +1,5 @@ +use serde::{Deserialize, Serialize}; + use super::*; /// Intended for connecting to an already existing Realtime session spawned by accepting an incoming SIP call from e.g. Twilio. @@ -41,3 +43,139 @@ impl RealtimeSipClient { Ok((write, read)) } } + +/// This is the payload of a `realtime.call.incoming` event webhook which is what OpenAI sends to your application when a call hits the SIP endpoint for your project. +/// Exposes some convenience methods for when a call comes from Twilio which is one of the more common use cases. `openai_call_id()` is what you will need to use accept/hangup endpoints. 
+/// +/// # Example +/// ```rust +/// const INSTRUCTIONS: &str = "You are a helpful assistant."; +/// #[axum::debug_handler] +/// async fn call_webhook( +/// State(mut state): State, +/// Json(event): Json, +/// ) -> impl IntoResponse { +/// let number = event.caller_number(); +/// let call_id = event.openai_call_id(); +/// let twilio_sid = event.twilio_call_sid(); +/// let account_sid = event.twilio_account_sid(); +/// log::info!( +/// "Call coming in from {:?} with OpenAi ID {:?}, Twilio SID {:?} / account SID {:?}", +/// number, +/// call_id, +/// twilio_sid, +/// account_sid +/// ); +/// +/// let accept_call = AcceptCallRequest::new(INSTRUCTIONS, RealtimeModel::GptRealtime); +/// +/// match state.openai_client.accept_call(call_id, accept_call).await { +/// Ok(_) => { +/// log::info!("Accepted call {}", call_id); +/// } +/// Err(err) => { +/// log::error!("Failed to accept call {}: {}", call_id, err); +/// } +/// }; +/// () +/// } +/// ``` +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct RealtimeCallIncoming { + pub id: String, + /// Always `event`. + pub object: String, + pub created_at: i64, + /// This should always be `realtime.call.incoming`. + #[serde(rename = "type")] + pub event_type: String, + /// Contains the actual unique data per call. Look for `call_id` here or call `openai_call_id()`. 
+ pub data: RealTimeCallIncomingData, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct RealTimeCallIncomingData { + pub call_id: String, + pub sip_headers: Vec, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct SipHeader { + pub name: String, + pub value: String, +} + +impl RealtimeCallIncoming { + /// Get the call ID from the event data + pub fn openai_call_id(&self) -> &str { + &self.data.call_id + } + + /// Extract the caller's phone number from the "From" SIP header + pub fn caller_number(&self) -> Option { + self.data + .sip_headers + .iter() + .find(|header| header.name == "From") + .and_then(|header| { + // Parse the From header to extract the phone number + // Format: "+48123123123" ;tag=... + if let Some(start) = header.value.find('"') { + if let Some(end) = header.value[start + 1..].find('"') { + return Some(header.value[start + 1..start + 1 + end].to_string()); + } + } + None + }) + } + + /// Get the Twilio Call SID from the X-Twilio-CallSid SIP header + pub fn twilio_call_sid(&self) -> Option<&str> { + self.data + .sip_headers + .iter() + .find(|header| header.name == "X-Twilio-CallSid") + .map(|header| header.value.as_str()) + } + + /// Get the Twilio Account SID from the X-Twilio-AccountSid SIP header + pub fn twilio_account_sid(&self) -> Option<&str> { + self.data + .sip_headers + .iter() + .find(|header| header.name == "X-Twilio-AccountSid") + .map(|header| header.value.as_str()) + } + + /// Get a specific SIP header value by name + pub fn get_sip_header(&self, name: &str) -> Option<&str> { + self.data + .sip_headers + .iter() + .find(|header| header.name == name) + .map(|header| header.value.as_str()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_parse_twilio_event() { + let json = r#"{"id": "evt_68bc6828707881908be189456b84cc07", "object": "event", "created_at": 1757177896, "type": "realtime.call.incoming", "data": {"call_id": "rtc_c5b6f97fe96f4c809b78916a9ac15748", "sip_headers": 
[{"name": "From", "value": "\"+48123123123\" ;tag=82568196_c3356d0b_03f1232a-01cf-4a4a-af25-bac077219d08"}, {"name": "X-Twilio-CallSid", "value": "CA080dd4bebc0320639d7ae33b82e80481"}, {"name": "X-Twilio-AccountSid", "value": "fake_data"}]}}"#; + + let event: RealtimeCallIncoming = serde_json::from_str(json).unwrap(); + + assert_eq!( + event.openai_call_id(), + "rtc_c5b6f97fe96f4c809b78916a9ac15748" + ); + assert_eq!(event.caller_number(), Some("+48123123123".to_string())); + assert_eq!( + event.twilio_call_sid(), + Some("CA080dd4bebc0320639d7ae33b82e80481") + ); + assert_eq!(event.twilio_account_sid(), Some("fake_data")); + } +} From 4e1b385804f2063b24e284274d78a5088e8ff562 Mon Sep 17 00:00:00 2001 From: janligudzinski Date: Thu, 11 Sep 2025 18:53:57 +0200 Subject: [PATCH 15/48] wip: rename session update fields --- src/realtime/types.rs | 33 +++++++++++++++++++++++++++++---- src/v1/realtime_calls.rs | 6 +++--- 2 files changed, 32 insertions(+), 7 deletions(-) diff --git a/src/realtime/types.rs b/src/realtime/types.rs index 9611e665..60a9d171 100644 --- a/src/realtime/types.rs +++ b/src/realtime/types.rs @@ -2,8 +2,17 @@ use serde::{Deserialize, Serialize}; #[derive(Debug, Serialize, Deserialize, Clone, Default)] pub struct Session { + /// Always `realtime` if specified. + #[serde(rename = "type", skip_serializing_if = "Option::is_none")] + pub session_type: Option, + // todo: audio #[serde(skip_serializing_if = "Option::is_none")] - pub modalities: Option>, + pub include: Option>, + #[serde(skip_serializing_if = "Option::is_none")] + pub model: Option, + /// Just `Audio` by default. Can also be `Text` for text-only. Both at the same time are not supported. 
+ #[serde(skip_serializing_if = "Option::is_none")] + pub output_modalities: Option>, #[serde(skip_serializing_if = "Option::is_none")] pub instructions: Option, #[serde(skip_serializing_if = "Option::is_none")] @@ -21,9 +30,11 @@ pub struct Session { #[serde(skip_serializing_if = "Option::is_none")] pub tool_choice: Option, #[serde(skip_serializing_if = "Option::is_none")] - pub temperature: Option, - #[serde(skip_serializing_if = "Option::is_none")] pub max_output_tokens: Option, + // Todo: Support prompt template reference and variables + // #[serde(skip_serializing_if = "Option::is_none")] + // pub prompt: Option, + // } #[derive(Debug, Serialize, Deserialize, Clone)] @@ -44,10 +55,24 @@ pub enum RealtimeModel { Gpt4oMiniRealtimePreview20241217, } +#[derive(Debug, Serialize, Deserialize, Clone)] +pub enum AdditionalServerOutput { + /// Include logprobs for input audio transcription. + #[serde(rename = "item.input_audio_transcription.logprobs")] + Logprobs, +} + +#[derive(Debug, Serialize, Deserialize, Clone)] +#[serde(rename_all = "lowercase")] +pub enum OutputModality { + Audio, + Text, +} + /// Enum representing the only possible value for `type` in the accept call payload. #[derive(Debug, Serialize, Deserialize, Clone)] #[serde(rename_all = "lowercase")] -pub enum AcceptCallSessionType { +pub enum RealtimeCallSessionType { Realtime, } diff --git a/src/v1/realtime_calls.rs b/src/v1/realtime_calls.rs index abc7bf14..1d8ba1d3 100644 --- a/src/v1/realtime_calls.rs +++ b/src/v1/realtime_calls.rs @@ -1,6 +1,6 @@ use serde::{Deserialize, Serialize}; -use crate::realtime::types::{AcceptCallSessionType, RealtimeModel}; +use crate::realtime::types::{RealtimeCallSessionType, RealtimeModel}; /// Used to start a realtime session based on an incoming call that you can then connect to over WSS with `RealtimeSipClient` from `openai_api_rs::realtime::sip`. 
/// Note that this is poorly documented by OpenAI with the only example data given in https://platform.openai.com/docs/guides/realtime-sip#handle-the-webhook and these may not be all the possible fields. @@ -8,14 +8,14 @@ use crate::realtime::types::{AcceptCallSessionType, RealtimeModel}; pub struct AcceptCallRequest { /// This is *always* `realtime`. Convenience constructor exposed to ensure this. #[serde(rename = "type")] - pub session_type: AcceptCallSessionType, + pub session_type: RealtimeCallSessionType, pub instructions: String, pub model: RealtimeModel, } impl AcceptCallRequest { pub fn new(instructions: impl Into, model: RealtimeModel) -> Self { Self { - session_type: AcceptCallSessionType::Realtime, + session_type: RealtimeCallSessionType::Realtime, instructions: instructions.into(), model, } From c2bcd938846570c087712d72d7e4f6f8e1d685cd Mon Sep 17 00:00:00 2001 From: janligudzinski Date: Thu, 11 Sep 2025 19:53:40 +0200 Subject: [PATCH 16/48] add new GA shape of audio config --- src/realtime/types.rs | 164 +++++++++++++++++++++++++++++++++++++----- 1 file changed, 148 insertions(+), 16 deletions(-) diff --git a/src/realtime/types.rs b/src/realtime/types.rs index 60a9d171..726a1430 100644 --- a/src/realtime/types.rs +++ b/src/realtime/types.rs @@ -5,7 +5,8 @@ pub struct Session { /// Always `realtime` if specified. 
+ #[serde(rename = "type", skip_serializing_if = "Option::is_none")] pub session_type: Option, - // todo: audio + #[serde(skip_serializing_if = "Option::is_none")] + pub audio: Option, #[serde(skip_serializing_if = "Option::is_none")] pub include: Option>, #[serde(skip_serializing_if = "Option::is_none")] @@ -16,16 +17,6 @@ pub struct Session { #[serde(skip_serializing_if = "Option::is_none")] pub instructions: Option, #[serde(skip_serializing_if = "Option::is_none")] - pub voice: Option, - #[serde(skip_serializing_if = "Option::is_none")] - pub input_audio_format: Option, - #[serde(skip_serializing_if = "Option::is_none")] - pub output_audio_format: Option, - #[serde(skip_serializing_if = "Option::is_none")] - pub input_audio_transcription: Option, - #[serde(skip_serializing_if = "Option::is_none")] - pub turn_detection: Option, - #[serde(skip_serializing_if = "Option::is_none")] pub tools: Option>, #[serde(skip_serializing_if = "Option::is_none")] pub tool_choice: Option, @@ -34,7 +25,12 @@ pub struct Session { // Todo: Support prompt template reference and variables // #[serde(skip_serializing_if = "Option::is_none")] // pub prompt: Option, - // + // Todo: Support tracing config + // #[serde(skip_serializing_if = "Option::is_none")] + // pub tracing: Option, // "auto" or config object + // Todo: Support truncation config (poorly documented atm) + // #[serde(skip_serializing_if = "Option::is_none")] + // pub truncation: Option, // "auto" or config object } #[derive(Debug, Serialize, Deserialize, Clone)] @@ -92,12 +88,148 @@ pub enum RealtimeVoice { } #[derive(Debug, Serialize, Deserialize, Clone)] +pub struct AudioConfig { + pub input: AudioInput, + pub output: AudioOutput, +} + +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct AudioInput { + pub format: AudioFormat, + /// Configuration for input audio noise reduction. This can be set to null to turn off. Noise reduction filters audio added to the input audio buffer before it is sent to VAD and the model.
+ /// Filtering the audio can improve VAD and turn detection accuracy (reducing false positives) and model performance by improving perception of the input audio. + pub noise_reduction: Option, + /// Configuration for input audio transcription, defaults to off and can be set to null to turn off once on. Input audio transcription is not native to the model, since the model consumes audio directly. Transcription runs asynchronously through the /audio/transcriptions endpoint and should be treated as guidance of input audio content rather than precisely what the model heard. The client can optionally set the language and prompt for transcription, these offer additional guidance to the transcription service. + pub transcription: Option, + /// Configuration for turn detection, either Server VAD or Semantic VAD. This can be set to null to turn off, in which case the client must manually trigger model response. + pub turn_detection: Option, +} + +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct TranscriptionConfig { + /// The language of the input audio in ISO-639-1 (e.g. "en") format. Will improve accuracy and latency if set. + #[serde(skip_serializing_if = "Option::is_none")] + pub language: Option, + pub model: TranscriptionModel, + /// An optional text to guide the model's style or continue a previous audio segment. For `whisper-1`, the prompt is a list of keywords. For `gpt-4o-transcribe` models, the prompt is a free text string, for example "expect words related to technology".
+ #[serde(skip_serializing_if = "Option::is_none")] + pub prompt: Option, +} + +#[derive(Debug, Serialize, Deserialize, Clone)] +pub enum TranscriptionModel { + #[serde(rename = "whisper-1")] + Whisper1, + #[serde(rename = "gpt-4o-transcribe-latest")] + Gpt4oTranscribeLatest, + #[serde(rename = "gpt-4o-mini-transcribe")] + Gpt4oMiniTranscribe, + #[serde(rename = "gpt-4o-transcribe")] + Gpt4oTranscribe, +} + +#[derive(Debug, Serialize, Deserialize, Clone)] +#[serde(rename_all = "snake_case")] +pub enum VadMode { + SemanticVad(SemanticVadConfig), +} + +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct ServerVadConfig { + /// Whether or not to automatically generate a response when a VAD stop event occurs. + pub create_response: bool, + /// Optional timeout after which a model response will be triggered automatically. This is useful for situations in which a long pause from the user is unexpected, such as a phone call. The model will effectively prompt the user to continue the conversation based on the current context. + /// The timeout value will be applied after the last model response's audio has finished playing, i.e. it's set to the `response.done` time plus audio playback duration. + /// An `input_audio_buffer.timeout_triggered` event (plus events associated with the Response) will be emitted when the timeout is reached. Idle timeout is currently only supported for server_vad mode. + pub idle_timeout_ms: Option, + /// Whether or not to automatically interrupt any ongoing response with output to the default conversation (i.e. `conversation` of `auto`) when a VAD start event occurs. + pub interrupt_response: bool, + /// Used only for server_vad mode. Amount of audio to include before the VAD detected speech (in milliseconds). Defaults to 300ms. + #[serde(skip_serializing_if = "Option::is_none")] + pub prefix_padding_ms: Option, + /// Used only for server_vad mode. Duration of silence to detect speech stop (in milliseconds). Defaults to 500ms. 
With shorter values the model will respond more quickly, but may jump in on short pauses from the user. + #[serde(skip_serializing_if = "Option::is_none")] + pub silence_duration_ms: Option, + /// Used only for server_vad mode. Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5. A higher threshold will require louder audio to activate the model, and thus might perform better in noisy environments. + #[serde(skip_serializing_if = "Option::is_none")] + pub threshold: Option, +} + +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct SemanticVadConfig { + /// Whether or not to automatically generate a response when a VAD stop event occurs. + pub create_response: bool, + pub eagerness: SemanticVadEagerness, + /// Whether or not to automatically interrupt any ongoing response with output to the default conversation (i.e. `conversation` of `auto`) when a VAD start event occurs. + pub interrupt_response: bool, +} + +/// low will wait longer for the user to continue speaking, high will respond more quickly. auto is the default and is equivalent to medium. low, medium, and high have max timeouts of 8s, 4s, and 2s respectively. +#[derive(Debug, Serialize, Deserialize, Clone)] +pub enum SemanticVadEagerness { + /// Equivalent to Medium. + Auto, + Low, + Medium, + High, +} + +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct NoiseReduction { + #[serde(rename = "type")] + pub reduction_type: NoiseReductionType, +} + +#[derive(Debug, Serialize, Deserialize, Clone)] +#[serde(rename_all = "snake_case")] +pub enum NoiseReductionType { + /// `near_field` is for close-talking microphones such as headphones + NearField, + /// `far_field` is for far-field microphones such as laptop or conference room microphones + FarField, +} + +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct AudioOutput { + pub format: AudioFormat, + /// The speed of the model's spoken response as a multiple of the original speed. 1.0 is the default speed. 
0.25 is the minimum speed. 1.5 is the maximum speed. This value can only be changed in between model turns, not while a response is in progress. + /// This parameter is a post-processing adjustment to the audio after it is generated, it's also possible to prompt the model to speak faster or slower. + pub speed: f64, + /// The voice the model uses to respond. Voice cannot be changed during the session once the model has responded with audio at least once. + #[serde(skip_serializing_if = "Option::is_none")] + pub voice: Option, +} + +#[derive(Debug, Serialize, Deserialize, Clone)] +#[serde(untagged)] pub enum AudioFormat { - #[serde(rename = "pcm16")] - PCM16, - #[serde(rename = "g711_ulaw")] + Pcm(AudioFormatDefinitionWithSampleRate), + Other(AudioFormatDefinition), +} + +/// This form of audio format definition is *only* used for the raw PCM format. +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct AudioFormatDefinitionWithSampleRate { + /// This must always be `24000` for PCM. + rate: i32, + /// Must be `Pcm`. 
+ #[serde(rename = "type")] + audio_type: AudioFormatIdentifier, +} + +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct AudioFormatDefinition { + #[serde(rename = "type")] + audio_type: AudioFormatIdentifier, +} + +#[derive(Debug, Serialize, Deserialize, Clone)] +pub enum AudioFormatIdentifier { + #[serde(rename = "audio/pcm")] + Pcm, + #[serde(rename = "audio/pcmu")] G711ULAW, - #[serde(rename = "g711_alaw")] + #[serde(rename = "audio/pcma")] G711ALAW, } From 1f605a5c611990c79b2f5372b965812024c28bef Mon Sep 17 00:00:00 2001 From: janligudzinski Date: Thu, 11 Sep 2025 20:05:40 +0200 Subject: [PATCH 17/48] Wrap Session in enum so both regular realtime and transcription only sessions can be started --- src/realtime/client_event.rs | 4 ++-- src/realtime/types.rs | 25 +++++++++++++++++++++---- 2 files changed, 23 insertions(+), 6 deletions(-) diff --git a/src/realtime/client_event.rs b/src/realtime/client_event.rs index 53805381..1b220527 100644 --- a/src/realtime/client_event.rs +++ b/src/realtime/client_event.rs @@ -1,7 +1,7 @@ use serde::{Deserialize, Serialize}; use tokio_tungstenite::tungstenite::Message; -use crate::realtime::types::{Item, Session}; +use crate::realtime::types::{Item, RealtimeSession, Session}; #[derive(Debug, Serialize, Deserialize, Clone, Default)] pub struct SessionUpdate { @@ -58,7 +58,7 @@ pub struct ConversationItemDelete { pub struct ResponseCreate { #[serde(skip_serializing_if = "Option::is_none")] pub event_id: Option, - pub response: Option, + pub response: Option, // this will not work } #[derive(Debug, Serialize, Deserialize, Clone, Default)] diff --git a/src/realtime/types.rs b/src/realtime/types.rs index 726a1430..4a5866f4 100644 --- a/src/realtime/types.rs +++ b/src/realtime/types.rs @@ -1,10 +1,27 @@ use serde::{Deserialize, Serialize}; +#[derive(Debug, Serialize, Deserialize, Clone)] +#[serde(rename_all = "lowercase", tag = "type")] +pub enum Session { + Realtime(RealtimeSession), + Transcription(TranscriptionSession), 
+} +impl Default for Session { + fn default() -> Self { + Self::Realtime(Default::default()) + } +} + +#[derive(Debug, Serialize, Deserialize, Clone, Default)] +pub struct TranscriptionSession { + #[serde(skip_serializing_if = "Option::is_none")] + pub audio: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub include: Option>, +} + #[derive(Debug, Serialize, Deserialize, Clone, Default)] -pub struct Session { - /// Always `realtime` if specified. - #[serde(rename = "type", skip_serializing_if = "Option::is_none")] - pub session_type: Option, +pub struct RealtimeSession { #[serde(skip_serializing_if = "Option::is_none")] pub audio: Option, #[serde(skip_serializing_if = "Option::is_none")] From 12f71570239b579e57a3d5b11d46c0bc9922a240 Mon Sep 17 00:00:00 2001 From: janligudzinski Date: Fri, 12 Sep 2025 12:19:15 +0200 Subject: [PATCH 18/48] nevermind, response.create does take something like a session config --- src/realtime/client_event.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/realtime/client_event.rs b/src/realtime/client_event.rs index 1b220527..aa17e5d3 100644 --- a/src/realtime/client_event.rs +++ b/src/realtime/client_event.rs @@ -58,7 +58,7 @@ pub struct ConversationItemDelete { pub struct ResponseCreate { #[serde(skip_serializing_if = "Option::is_none")] pub event_id: Option, - pub response: Option, // this will not work + pub response: Option, } #[derive(Debug, Serialize, Deserialize, Clone, Default)] From c46f2a6f20b2c15de9265b7f50af7f9da002fbed Mon Sep 17 00:00:00 2001 From: janligudzinski Date: Fri, 12 Sep 2025 14:32:49 +0200 Subject: [PATCH 19/48] make ItemType PartialEq for filtering --- src/realtime/types.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/realtime/types.rs b/src/realtime/types.rs index 4a5866f4..fc4745b6 100644 --- a/src/realtime/types.rs +++ b/src/realtime/types.rs @@ -306,7 +306,7 @@ pub enum MaxOutputTokens { Inf, } -#[derive(Debug, Serialize, Deserialize, Clone)] 
+#[derive(Debug, Serialize, Deserialize, Clone, PartialEq)] #[serde(rename_all = "snake_case")] pub enum ItemType { Message, From 56d15ea256b0d1191439835043e42fe3b69af5c8 Mon Sep 17 00:00:00 2001 From: janligudzinski Date: Fri, 12 Sep 2025 14:34:07 +0200 Subject: [PATCH 20/48] partialEq for ItemRole --- src/realtime/types.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/realtime/types.rs b/src/realtime/types.rs index fc4745b6..2ed57ede 100644 --- a/src/realtime/types.rs +++ b/src/realtime/types.rs @@ -322,7 +322,7 @@ pub enum ItemStatus { Incomplete, } -#[derive(Debug, Serialize, Deserialize, Clone)] +#[derive(Debug, Serialize, Deserialize, Clone, PartialEq)] #[serde(rename_all = "lowercase")] pub enum ItemRole { User, From ac95318be48202f8f94135b8864693628d6343d2 Mon Sep 17 00:00:00 2001 From: janligudzinski Date: Fri, 12 Sep 2025 14:34:39 +0200 Subject: [PATCH 21/48] more partialEqs --- src/realtime/types.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/realtime/types.rs b/src/realtime/types.rs index 2ed57ede..46649111 100644 --- a/src/realtime/types.rs +++ b/src/realtime/types.rs @@ -314,7 +314,7 @@ pub enum ItemType { FunctionCallOutput, } -#[derive(Debug, Serialize, Deserialize, Clone)] +#[derive(Debug, Serialize, Deserialize, Clone, PartialEq)] #[serde(rename_all = "snake_case")] pub enum ItemStatus { Completed, @@ -330,7 +330,7 @@ pub enum ItemRole { System, } -#[derive(Debug, Serialize, Deserialize, Clone)] +#[derive(Debug, Serialize, Deserialize, Clone, PartialEq)] #[serde(rename_all = "snake_case")] pub enum ItemContentType { InputText, From 297cebec98b9a604230bd37856aca2c368222c55 Mon Sep 17 00:00:00 2001 From: janligudzinski Date: Fri, 12 Sep 2025 14:38:51 +0200 Subject: [PATCH 22/48] fix: make audio format components public --- src/realtime/types.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/realtime/types.rs b/src/realtime/types.rs index 46649111..aa73cb09 100644 --- 
a/src/realtime/types.rs +++ b/src/realtime/types.rs @@ -228,16 +228,16 @@ pub enum AudioFormat { #[derive(Debug, Serialize, Deserialize, Clone)] pub struct AudioFormatDefinitionWithSampleRate { /// This must always be `24000` for PCM. - rate: i32, + pub rate: i32, /// Must be `Pcm`. #[serde(rename = "type")] - audio_type: AudioFormatIdentifier, + pub audio_type: AudioFormatIdentifier, } #[derive(Debug, Serialize, Deserialize, Clone)] pub struct AudioFormatDefinition { #[serde(rename = "type")] - audio_type: AudioFormatIdentifier, + pub audio_type: AudioFormatIdentifier, } #[derive(Debug, Serialize, Deserialize, Clone)] From dac26030ee02420ee50b4b446664602e62cc90c6 Mon Sep 17 00:00:00 2001 From: janligudzinski Date: Fri, 12 Sep 2025 15:44:50 +0200 Subject: [PATCH 23/48] make turn detection support both server (changed) and semantic vad mode --- src/realtime/types.rs | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/realtime/types.rs b/src/realtime/types.rs index aa73cb09..7a204e76 100644 --- a/src/realtime/types.rs +++ b/src/realtime/types.rs @@ -261,11 +261,8 @@ pub struct AudioTranscription { #[serde(tag = "type")] pub enum TurnDetection { #[serde(rename = "server_vad")] - ServerVAD { - threshold: f32, - prefix_padding_ms: u32, - silence_duration_ms: u32, - }, + ServerVAD(ServerVadConfig), + SemanticVAD(SemanticVadConfig), } #[derive(Debug, Serialize, Deserialize, Clone)] From 6deffe0fe149be84518647acddb3a2e6b50dd2a5 Mon Sep 17 00:00:00 2001 From: janligudzinski Date: Fri, 12 Sep 2025 16:26:00 +0200 Subject: [PATCH 24/48] pass model information to realtime sip client --- src/realtime/api/sip.rs | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/src/realtime/api/sip.rs b/src/realtime/api/sip.rs index 6e09ac57..e0795dcf 100644 --- a/src/realtime/api/sip.rs +++ b/src/realtime/api/sip.rs @@ -1,5 +1,7 @@ use serde::{Deserialize, Serialize}; +use crate::realtime::types::RealtimeModel; + use super::*; 
/// Intended for connecting to an already existing Realtime session spawned by accepting an incoming SIP call from e.g. Twilio. @@ -7,19 +9,26 @@ pub struct RealtimeSipClient { pub wss_url: String, pub api_key: String, pub call_id: String, + pub model: RealtimeModel, // contrary to the OpenAI tutorial, joining an SIP session without a `model` param causes an "invalid_request_error.missing_model" } impl RealtimeSipClient { - pub fn new(api_key: String, call_id: String) -> Self { + pub fn new(api_key: String, call_id: String, model: RealtimeModel) -> Self { let wss_url = std::env::var("WSS_URL").unwrap_or_else(|_| WSS_URL.to_owned()); - Self::new_with_endpoint(wss_url, api_key, call_id) + Self::new_with_endpoint(wss_url, api_key, call_id, model) } - pub fn new_with_endpoint(wss_url: String, api_key: String, call_id: String) -> Self { + pub fn new_with_endpoint( + wss_url: String, + api_key: String, + call_id: String, + model: RealtimeModel, + ) -> Self { Self { wss_url, api_key, call_id, + model, } } @@ -32,7 +41,17 @@ impl RealtimeSipClient { ), Box, > { - let url = format!("{}?callId={}", self.wss_url, self.call_id); + let model_slug = serde_json::to_string(&self.model).unwrap(); + let model_slug = model_slug + .strip_prefix("\"") + .unwrap() + .strip_suffix("\"") + .unwrap(); + + let url = format!( + "{}?callId={}&model={}", + self.wss_url, self.call_id, model_slug + ); let mut request = url.into_client_request()?; let api_key = self.api_key.clone(); request From 80b726fae10631a0ca4efeefc1a4af44c88dfb1c Mon Sep 17 00:00:00 2001 From: janligudzinski Date: Fri, 12 Sep 2025 16:33:19 +0200 Subject: [PATCH 25/48] reorder MaxOutputTokens variants so inf matches first --- src/realtime/types.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/realtime/types.rs b/src/realtime/types.rs index 7a204e76..ef29794c 100644 --- a/src/realtime/types.rs +++ b/src/realtime/types.rs @@ -298,9 +298,9 @@ pub enum FunctionType { #[derive(Debug, Serialize, 
Deserialize, Clone)] #[serde(untagged)] pub enum MaxOutputTokens { - Num(u16), #[serde(rename = "inf")] Inf, + Num(u16), } #[derive(Debug, Serialize, Deserialize, Clone, PartialEq)] From 8a7b0197ce4689c7e2ddc08f295c2a27d196055a Mon Sep 17 00:00:00 2001 From: janligudzinski Date: Fri, 12 Sep 2025 16:40:48 +0200 Subject: [PATCH 26/48] experiment fix: treat "inf" as string --- src/realtime/types.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/realtime/types.rs b/src/realtime/types.rs index ef29794c..45d19584 100644 --- a/src/realtime/types.rs +++ b/src/realtime/types.rs @@ -298,8 +298,7 @@ pub enum FunctionType { #[derive(Debug, Serialize, Deserialize, Clone)] #[serde(untagged)] pub enum MaxOutputTokens { - #[serde(rename = "inf")] - Inf, + Inf(String), Num(u16), } From 147e5496881183c13cc19a208ef5cf168207783d Mon Sep 17 00:00:00 2001 From: janligudzinski Date: Fri, 12 Sep 2025 16:44:16 +0200 Subject: [PATCH 27/48] rename conversation.item.created -> .added --- src/realtime/server_event.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/realtime/server_event.rs b/src/realtime/server_event.rs index b02a139d..bd2c0e7a 100644 --- a/src/realtime/server_event.rs +++ b/src/realtime/server_event.rs @@ -55,7 +55,7 @@ pub struct InputAudioBufferSpeechStopped { } #[derive(Debug, Serialize, Deserialize, Clone)] -pub struct ConversationItemCreated { +pub struct ConversationItemAdded { pub event_id: String, pub previous_item_id: Option, pub item: Item, @@ -261,8 +261,8 @@ pub enum ServerEvent { InputAudioBufferSpeechStarted(InputAudioBufferSpeechStarted), #[serde(rename = "input_audio_buffer.speech_stopped")] InputAudioBufferSpeechStopped(InputAudioBufferSpeechStopped), - #[serde(rename = "conversation.item.created")] - ConversationItemCreated(ConversationItemCreated), + #[serde(rename = "conversation.item.added")] + ConversationItemCreated(ConversationItemAdded), #[serde(rename = 
"conversation.item.input_audio_transcription.completed")] ConversationItemInputAudioTranscriptionCompleted( ConversationItemInputAudioTranscriptionCompleted, From de0704b6c29a018c8dc45e4b5ec331e78eabd313 Mon Sep 17 00:00:00 2001 From: janligudzinski Date: Fri, 12 Sep 2025 16:46:54 +0200 Subject: [PATCH 28/48] more naming changes --- src/realtime/server_event.rs | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/src/realtime/server_event.rs b/src/realtime/server_event.rs index bd2c0e7a..1dcb8383 100644 --- a/src/realtime/server_event.rs +++ b/src/realtime/server_event.rs @@ -178,7 +178,7 @@ pub struct ResponseTextDone { } #[derive(Debug, Serialize, Deserialize, Clone)] -pub struct ResponseAudioTranscriptDelta { +pub struct ResponseOutputAudioTranscriptDelta { pub event_id: String, pub response_id: String, pub item_id: String, @@ -188,7 +188,7 @@ pub struct ResponseAudioTranscriptDelta { } #[derive(Debug, Serialize, Deserialize, Clone)] -pub struct ResponseAudioTranscriptDone { +pub struct ResponseOutputAudioTranscriptDone { pub event_id: String, pub response_id: String, pub item_id: String, @@ -198,7 +198,7 @@ pub struct ResponseAudioTranscriptDone { } #[derive(Debug, Serialize, Deserialize, Clone)] -pub struct ResponseAudioDelta { +pub struct ResponseOutputAudioDelta { pub event_id: String, pub response_id: String, pub item_id: String, @@ -208,7 +208,7 @@ pub struct ResponseAudioDelta { } #[derive(Debug, Serialize, Deserialize, Clone)] -pub struct ResponseAudioDone { +pub struct ResponseOutputAudioDone { pub event_id: String, pub response_id: String, pub item_id: String, @@ -262,7 +262,7 @@ pub enum ServerEvent { #[serde(rename = "input_audio_buffer.speech_stopped")] InputAudioBufferSpeechStopped(InputAudioBufferSpeechStopped), #[serde(rename = "conversation.item.added")] - ConversationItemCreated(ConversationItemAdded), + ConversationItemAdded(ConversationItemAdded), #[serde(rename = 
"conversation.item.input_audio_transcription.completed")] ConversationItemInputAudioTranscriptionCompleted( ConversationItemInputAudioTranscriptionCompleted, @@ -295,14 +295,14 @@ pub enum ServerEvent { ResponseTextDelta(ResponseTextDelta), #[serde(rename = "response.text.done")] ResponseTextDone(ResponseTextDone), - #[serde(rename = "response.audio_transcript.delta")] - ResponseAudioTranscriptDelta(ResponseAudioTranscriptDelta), - #[serde(rename = "response.audio_transcript.done")] - ResponseAudioTranscriptDone(ResponseAudioTranscriptDone), - #[serde(rename = "response.audio.delta")] - ResponseAudioDelta(ResponseAudioDelta), - #[serde(rename = "response.audio.done")] - ResponseAudioDone(ResponseAudioDone), + #[serde(rename = "response.output_audio_transcript.delta")] + ResponseOutputAudioTranscriptDelta(ResponseOutputAudioTranscriptDelta), + #[serde(rename = "response.output_audio_transcript.done")] + ResponseOutputAudioTranscriptDone(ResponseOutputAudioTranscriptDone), + #[serde(rename = "response.output_audio.delta")] + ResponseOutputAudioDelta(ResponseOutputAudioDelta), + #[serde(rename = "response.output_audio.done")] + ResponseOutputAudioDone(ResponseOutputAudioDone), #[serde(rename = "response.function_call_arguments.delta")] ResponseFunctionCallArgumentsDelta(ResponseFunctionCallArgumentsDelta), #[serde(rename = "response.function_call_arguments.done")] From 459505c1b4e09692dc3f283988571914c545455c Mon Sep 17 00:00:00 2001 From: janligudzinski Date: Fri, 12 Sep 2025 16:48:47 +0200 Subject: [PATCH 29/48] rename audio => output_audio --- src/realtime/types.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/realtime/types.rs b/src/realtime/types.rs index 45d19584..7ff8f0cb 100644 --- a/src/realtime/types.rs +++ b/src/realtime/types.rs @@ -341,7 +341,7 @@ pub struct ItemContent { #[serde(skip_serializing_if = "Option::is_none")] pub text: Option, #[serde(skip_serializing_if = "Option::is_none")] - pub audio: Option, + pub output_audio: 
Option, #[serde(skip_serializing_if = "Option::is_none")] pub transcript: Option, } From 6db1e23f760f2382b523fb050241fa810d45aeb1 Mon Sep 17 00:00:00 2001 From: janligudzinski Date: Fri, 12 Sep 2025 16:55:12 +0200 Subject: [PATCH 30/48] more audio -> output_audio renaming --- src/realtime/types.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/realtime/types.rs b/src/realtime/types.rs index 7ff8f0cb..8e092fa0 100644 --- a/src/realtime/types.rs +++ b/src/realtime/types.rs @@ -332,7 +332,7 @@ pub enum ItemContentType { InputText, InputAudio, Text, - Audio, + OutputAudio, } #[derive(Debug, Serialize, Deserialize, Clone)] From 7f3b5b92d074f940e1092a4f4a54cfc0670a881b Mon Sep 17 00:00:00 2001 From: janligudzinski Date: Fri, 12 Sep 2025 16:58:46 +0200 Subject: [PATCH 31/48] add conversation.item.done event --- src/realtime/server_event.rs | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/realtime/server_event.rs b/src/realtime/server_event.rs index 1dcb8383..92080218 100644 --- a/src/realtime/server_event.rs +++ b/src/realtime/server_event.rs @@ -61,6 +61,13 @@ pub struct ConversationItemAdded { pub item: Item, } +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct ConversationItemDone { + pub event_id: String, + pub previous_item_id: Option, + pub item: Item, +} + #[derive(Debug, Serialize, Deserialize, Clone)] pub struct ConversationItemInputAudioTranscriptionCompleted { pub event_id: String, @@ -273,6 +280,8 @@ pub enum ServerEvent { ConversationItemTruncated(ConversationItemTruncated), #[serde(rename = "conversation.item.deleted")] ConversationItemDeleted(ConversationItemDeleted), + #[serde(rename = "conversation.item.done")] + ConversationItemDone(ConversationItemDone), #[serde(rename = "output_audio_buffer.started")] OutputAudioBufferStarted(OutputAudioBufferStarted), #[serde(rename = "output_audio_buffer.stopped")] From e31fa9531768906a2a629e6b9e8aa80bbc4fb64a Mon Sep 17 00:00:00 2001 From: janligudzinski Date: Mon, 15 
Sep 2025 17:23:13 +0200 Subject: [PATCH 32/48] include all `session` fields in AcceptCallRequest --- src/realtime/types.rs | 7 ------- src/v1/realtime_calls.rs | 21 ++++++--------------- 2 files changed, 6 insertions(+), 22 deletions(-) diff --git a/src/realtime/types.rs b/src/realtime/types.rs index 8e092fa0..d32b7942 100644 --- a/src/realtime/types.rs +++ b/src/realtime/types.rs @@ -82,13 +82,6 @@ pub enum OutputModality { Text, } -/// Enum representing the only possible value for `type` in the accept call payload. -#[derive(Debug, Serialize, Deserialize, Clone)] -#[serde(rename_all = "lowercase")] -pub enum RealtimeCallSessionType { - Realtime, -} - #[derive(Debug, Serialize, Deserialize, Clone)] #[serde(rename_all = "lowercase")] pub enum RealtimeVoice { diff --git a/src/v1/realtime_calls.rs b/src/v1/realtime_calls.rs index 1d8ba1d3..aae43339 100644 --- a/src/v1/realtime_calls.rs +++ b/src/v1/realtime_calls.rs @@ -1,25 +1,16 @@ use serde::{Deserialize, Serialize}; -use crate::realtime::types::{RealtimeCallSessionType, RealtimeModel}; +use crate::realtime::types::Session; /// Used to start a realtime session based on an incoming call that you can then connect to over WSS with `RealtimeSipClient` from `openai_api_rs::realtime::sip`. /// Note that this is poorly documented by OpenAI with the only example data given in https://platform.openai.com/docs/guides/realtime-sip#handle-the-webhook and these may not be all the possible fields. +/// Per an OpenAI dev (https://community.openai.com/t/how-to-setup-transcription-on-realtime-api-with-sip/1355068/12) anything that can be passed to `session.update` over WSS can be passed to /accept, +/// as well as `model`, ordinarily reserved for `session.create`. #[derive(Debug, Serialize, Deserialize, Clone)] pub struct AcceptCallRequest { - /// This is *always* `realtime`. Convenience constructor exposed to ensure this. 
- #[serde(rename = "type")] - pub session_type: RealtimeCallSessionType, - pub instructions: String, - pub model: RealtimeModel, -} -impl AcceptCallRequest { - pub fn new(instructions: impl Into, model: RealtimeModel) -> Self { - Self { - session_type: RealtimeCallSessionType::Realtime, - instructions: instructions.into(), - model, - } - } + /// The session must *always* be a `realtime` one. + #[serde(flatten)] + pub session: Session, } /// Used to redirect a call to another number. Per https://platform.openai.com/docs/guides/realtime-sip#handle-the-webhook the Tel-URI scheme may be used. From 9cdcb77c0c45114538b52c227a96ef89ed180856 Mon Sep 17 00:00:00 2001 From: janligudzinski Date: Mon, 15 Sep 2025 18:34:41 +0200 Subject: [PATCH 33/48] spacing + doc comm --- src/v1/realtime_calls.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/v1/realtime_calls.rs b/src/v1/realtime_calls.rs index aae43339..97887558 100644 --- a/src/v1/realtime_calls.rs +++ b/src/v1/realtime_calls.rs @@ -3,7 +3,8 @@ use serde::{Deserialize, Serialize}; use crate::realtime::types::Session; /// Used to start a realtime session based on an incoming call that you can then connect to over WSS with `RealtimeSipClient` from `openai_api_rs::realtime::sip`. -/// Note that this is poorly documented by OpenAI with the only example data given in https://platform.openai.com/docs/guides/realtime-sip#handle-the-webhook and these may not be all the possible fields. +/// Note that this is poorly documented by OpenAI with the only example data given in https://platform.openai.com/docs/guides/realtime-sip#handle-the-webhook. +/// /// Per an OpenAI dev (https://community.openai.com/t/how-to-setup-transcription-on-realtime-api-with-sip/1355068/12) anything that can be passed to `session.update` over WSS can be passed to /accept, /// as well as `model`, ordinarily reserved for `session.create`. 
#[derive(Debug, Serialize, Deserialize, Clone)] From 8686f473f666b6684d534efe7ad6c8940483bdd9 Mon Sep 17 00:00:00 2001 From: janligudzinski Date: Mon, 15 Sep 2025 19:58:22 +0200 Subject: [PATCH 34/48] fix: fix call_id casing in RealtimeSipClient --- src/realtime/api/sip.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/realtime/api/sip.rs b/src/realtime/api/sip.rs index e0795dcf..f6a8607b 100644 --- a/src/realtime/api/sip.rs +++ b/src/realtime/api/sip.rs @@ -49,7 +49,7 @@ impl RealtimeSipClient { .unwrap(); let url = format!( - "{}?callId={}&model={}", + "{}?call_id={}&model={}", self.wss_url, self.call_id, model_slug ); let mut request = url.into_client_request()?; From 3d2aec3d709e4c9af7ad67b40a09fa346de8b535 Mon Sep 17 00:00:00 2001 From: janligudzinski Date: Mon, 15 Sep 2025 20:09:52 +0200 Subject: [PATCH 35/48] add delta item for input audio transcription --- src/realtime/server_event.rs | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/realtime/server_event.rs b/src/realtime/server_event.rs index 92080218..64a0a02f 100644 --- a/src/realtime/server_event.rs +++ b/src/realtime/server_event.rs @@ -76,6 +76,15 @@ pub struct ConversationItemInputAudioTranscriptionCompleted { pub transcript: String, } +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct ConversationItemInputAudioTranscriptionDelta { + pub event_id: String, + pub item_id: String, + pub content_index: u32, + pub delta: String, + // todo: add logprobs support +} + #[derive(Debug, Serialize, Deserialize, Clone)] pub struct ConversationItemInputAudioTranscriptionFailed { pub event_id: String, @@ -274,6 +283,8 @@ pub enum ServerEvent { ConversationItemInputAudioTranscriptionCompleted( ConversationItemInputAudioTranscriptionCompleted, ), + #[serde(rename = "conversation.item.input_audio_transcription.delta")] + ConversationItemInputAudioTranscriptionDelta(ConversationItemInputAudioTranscriptionDelta), #[serde(rename = 
"conversation.item.input_audio_transcription.failed")] ConversationItemInputAudioTranscriptionFailed(ConversationItemInputAudioTranscriptionFailed), #[serde(rename = "conversation.item.truncated")] From d486710b4cbad4345073b7169b8dbe01e49062ce Mon Sep 17 00:00:00 2001 From: janligudzinski Date: Wed, 17 Sep 2025 16:13:43 +0200 Subject: [PATCH 36/48] feature: add MCP tool definitions --- src/realtime/types.rs | 76 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 75 insertions(+), 1 deletion(-) diff --git a/src/realtime/types.rs b/src/realtime/types.rs index d32b7942..548e943a 100644 --- a/src/realtime/types.rs +++ b/src/realtime/types.rs @@ -1,3 +1,5 @@ +use std::collections::HashMap; + use serde::{Deserialize, Serialize}; #[derive(Debug, Serialize, Deserialize, Clone)] @@ -260,13 +262,85 @@ pub enum TurnDetection { #[derive(Debug, Serialize, Deserialize, Clone)] #[serde(tag = "type")] +#[serde(rename_all = "lowercase")] pub enum ToolDefinition { - #[serde(rename = "function")] Function { name: String, description: String, parameters: serde_json::Value, }, + Mcp { + server_label: String, + allowed_tools: McpAllowedTools, + /// An OAuth access token that can be used with a remote MCP server, either with a custom MCP server URL or a service connector. Your application must handle the OAuth authorization flow and provide the token here. + #[serde(skip_serializing_if = "Option::is_none")] + authorization: Option, + /// One of server_url or connector_id must be provided but not both. + #[serde(skip_serializing_if = "Option::is_none")] + connector_id: Option, + #[serde(skip_serializing_if = "Option::is_none")] + headers: Option>, + /// Specify which of the MCP server's tools require approval. + require_approval: McpApprovalSettings, + #[serde(skip_serializing_if = "Option::is_none")] + server_description: Option, + /// One of server_url or connector_id must be provided but not both. 
+ #[serde(skip_serializing_if = "Option::is_none")] + server_url: Option, + }, +} + +#[derive(Debug, Serialize, Deserialize, Clone)] +#[serde(untagged)] +pub enum McpApprovalSettings { + Filter(McpApprovalFilter), + SinglePolicy(McpApprovalMode), +} + +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct McpApprovalFilter { + always: McpFilterObject, + never: McpFilterObject, +} + +#[derive(Debug, Serialize, Deserialize, Clone)] +#[serde(rename_all = "lowercase")] +pub enum McpApprovalMode { + Always, + Never, +} + +#[derive(Debug, Serialize, Deserialize, Clone)] +#[serde(untagged)] +pub enum McpAllowedTools { + FilterObject(McpFilterObject), + ToolNames(Vec), +} + +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct McpFilterObject { + read_only: bool, + tool_names: Vec, +} + +#[derive(Debug, Serialize, Deserialize, Clone)] +pub enum Connector { + #[serde(rename = "connector_dropbox")] + ConnectorDropbox, + #[serde(rename = "connector_gmail")] + ConnectorGmail, + #[serde(rename = "connector_googlecalendar")] + ConnectorGoogleCalendar, + #[serde(rename = "connector_googledrive")] + ConnectorGoogleDrive, + #[serde(rename = "connector_microsoftteams")] + ConnectorMicrosoftTeams, + #[serde(rename = "connector_outlookcalendar")] + ConnectorOutlookCalendar, + #[serde(rename = "connector_outlookemail")] + ConnectorOutlookEmail, + #[serde(rename = "connector_sharepoint")] + ConnectorSharepoint, } #[derive(Debug, Serialize, Deserialize, Clone)] From 4f6c023dd351748169b1a3de4cf8f1350e4d7ff0 Mon Sep 17 00:00:00 2001 From: janligudzinski Date: Wed, 17 Sep 2025 16:15:39 +0200 Subject: [PATCH 37/48] add MCP tool choice type --- src/realtime/types.rs | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/realtime/types.rs b/src/realtime/types.rs index 548e943a..80e528cb 100644 --- a/src/realtime/types.rs +++ b/src/realtime/types.rs @@ -354,6 +354,12 @@ pub enum ToolChoice { r#type: FunctionType, name: String, }, + #[serde(untagged)] + Mcp { + r#type: 
McpType, + name: String, + server_label: String, + }, } #[derive(Debug, Serialize, Deserialize, Clone)] @@ -362,6 +368,12 @@ pub enum FunctionType { Function, } +#[derive(Debug, Serialize, Deserialize, Clone)] +#[serde(rename_all = "lowercase")] +pub enum McpType { + Mcp, +} + #[derive(Debug, Serialize, Deserialize, Clone)] #[serde(untagged)] pub enum MaxOutputTokens { From 4e4949002b5c7c2b24ca3f7c569d85ee626d0eee Mon Sep 17 00:00:00 2001 From: janligudzinski Date: Wed, 17 Sep 2025 16:19:58 +0200 Subject: [PATCH 38/48] add response mcp call arguments events --- src/realtime/server_event.rs | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/src/realtime/server_event.rs b/src/realtime/server_event.rs index 64a0a02f..be013453 100644 --- a/src/realtime/server_event.rs +++ b/src/realtime/server_event.rs @@ -252,6 +252,26 @@ pub struct ResponseFunctionCallArgumentsDone { pub arguments: String, } +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct ResponseMcpCallArgumentsDelta { + pub event_id: String, + pub item_id: String, + #[serde(default)] + pub obfuscation: Option, + pub output_index: u32, + pub response_id: String, + pub delta: String, +} + +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct ResponseMcpCallArgumentsDone { + pub event_id: String, + pub item_id: String, + pub output_index: u32, + pub response_id: String, + pub arguments: String, +} + #[derive(Debug, Serialize, Deserialize, Clone)] pub struct RateLimitsUpdated { pub event_id: String, @@ -327,6 +347,10 @@ pub enum ServerEvent { ResponseFunctionCallArgumentsDelta(ResponseFunctionCallArgumentsDelta), #[serde(rename = "response.function_call_arguments.done")] ResponseFunctionCallArgumentsDone(ResponseFunctionCallArgumentsDone), + #[serde(rename = "response.mcp_call_arguments.delta")] + ResponseMcpCallArgumentsDelta(ResponseMcpCallArgumentsDelta), + #[serde(rename = "response.mcp_call_arguments.done")] + ResponseMcpCallArgumentsDone(ResponseMcpCallArgumentsDone), 
#[serde(rename = "rate_limits.updated")] RateLimitsUpdated(RateLimitsUpdated), } From 02c299487e2ca61d4792e0b98b5d110a80e1e65a Mon Sep 17 00:00:00 2001 From: janligudzinski Date: Wed, 17 Sep 2025 16:22:49 +0200 Subject: [PATCH 39/48] add response.mcp.* events --- src/realtime/server_event.rs | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/src/realtime/server_event.rs b/src/realtime/server_event.rs index be013453..51c34e33 100644 --- a/src/realtime/server_event.rs +++ b/src/realtime/server_event.rs @@ -272,6 +272,27 @@ pub struct ResponseMcpCallArgumentsDone { pub arguments: String, } +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct ResponseMcpCallInProgress { + pub event_id: String, + pub item_id: String, + pub output_index: u32, +} + +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct ResponseMcpCallCompleted { + pub event_id: String, + pub item_id: String, + pub output_index: u32, +} + +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct ResponseMcpCallFailed { + pub event_id: String, + pub item_id: String, + pub output_index: u32, +} + #[derive(Debug, Serialize, Deserialize, Clone)] pub struct RateLimitsUpdated { pub event_id: String, @@ -351,6 +372,12 @@ pub enum ServerEvent { ResponseMcpCallArgumentsDelta(ResponseMcpCallArgumentsDelta), #[serde(rename = "response.mcp_call_arguments.done")] ResponseMcpCallArgumentsDone(ResponseMcpCallArgumentsDone), + #[serde(rename = "response.mcp_call.in_progress")] + ResponseMcpCallInProgress(ResponseMcpCallInProgress), + #[serde(rename = "response.mcp_call.completed")] + ResponseMcpCallCompleted(ResponseMcpCallCompleted), + #[serde(rename = "response.mcp_call.failed")] + ResponseMcpCallFailed(ResponseMcpCallFailed), #[serde(rename = "rate_limits.updated")] RateLimitsUpdated(RateLimitsUpdated), } From 447fb6417907b8084b2077c4d941bf1208f60437 Mon Sep 17 00:00:00 2001 From: janligudzinski Date: Wed, 17 Sep 2025 16:25:32 +0200 Subject: [PATCH 40/48] feat: mcp list 
tools events --- src/realtime/server_event.rs | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/src/realtime/server_event.rs b/src/realtime/server_event.rs index 51c34e33..91a0fdc4 100644 --- a/src/realtime/server_event.rs +++ b/src/realtime/server_event.rs @@ -299,6 +299,27 @@ pub struct RateLimitsUpdated { pub rate_limits: Vec, } +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct McpListToolsInProgress { + pub event_id: String, + pub item_id: String, + pub output_index: u32, +} + +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct McpListToolsCompleted { + pub event_id: String, + pub item_id: String, + pub output_index: u32, +} + +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct McpListToolsFailed { + pub event_id: String, + pub item_id: String, + pub output_index: u32, +} + #[derive(Debug, Serialize, Deserialize, Clone)] #[serde(tag = "type")] pub enum ServerEvent { @@ -380,4 +401,10 @@ pub enum ServerEvent { ResponseMcpCallFailed(ResponseMcpCallFailed), #[serde(rename = "rate_limits.updated")] RateLimitsUpdated(RateLimitsUpdated), + #[serde(rename = "mcp_list_tools.in_progress")] + McpListToolsInProgress(McpListToolsInProgress), + #[serde(rename = "mcp_list_tools.completed")] + McpListToolsCompleted(McpListToolsCompleted), + #[serde(rename = "mcp_list_tools.failed")] + McpListToolsFailed(McpListToolsFailed), } From 1b34f423ed8172e1fd0cd6a22ea3d3876c25d899 Mon Sep 17 00:00:00 2001 From: janligudzinski Date: Wed, 17 Sep 2025 17:38:34 +0200 Subject: [PATCH 41/48] support MCP-related conversation items --- src/realtime/types.rs | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/src/realtime/types.rs b/src/realtime/types.rs index 80e528cb..0826bc10 100644 --- a/src/realtime/types.rs +++ b/src/realtime/types.rs @@ -387,6 +387,10 @@ pub enum ItemType { Message, FunctionCall, FunctionCallOutput, + McpApprovalResponse, + McpListTools, + McpToolCall, + McpApprovalRequest, } 
#[derive(Debug, Serialize, Deserialize, Clone, PartialEq)] @@ -429,6 +433,9 @@ pub struct ItemContent { pub struct Item { #[serde(skip_serializing_if = "Option::is_none")] pub id: Option, + // Generic to all Item types: + #[serde(skip_serializing_if = "Option::is_none")] + pub previous_item_id: Option, #[serde(skip_serializing_if = "Option::is_none")] pub r#type: Option, #[serde(skip_serializing_if = "Option::is_none")] @@ -443,8 +450,30 @@ pub struct Item { pub name: Option, #[serde(skip_serializing_if = "Option::is_none")] pub arguments: Option, + // found both in function and MCP tool calls #[serde(skip_serializing_if = "Option::is_none")] pub output: Option, + // fields specific to approval request items: + #[serde(skip_serializing_if = "Option::is_none")] + pub approval_request_id: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub approve: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub reason: Option, + // common to all MCP items: + #[serde(skip_serializing_if = "Option::is_none")] + pub server_label: Option, + // specific to MCP tool list: + // "name", server_label is already there + #[serde(skip_serializing_if = "Option::is_none")] + pub tools: Option>, + // to MCP tool call: + // arguments already there from the deprecated "functions" functionality + // id, name, server_label, approval_request_id, output already there + #[serde(skip_serializing_if = "Option::is_none")] + pub error: Option, + // specific to MCP approval request: + // arguments, id, name, server_label all already there } impl TryFrom for Item { @@ -549,3 +578,12 @@ pub struct RateLimit { pub remaining: u32, pub reset_seconds: f32, } + +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct McpToolListing { + pub input_schema: serde_json::Value, + pub name: String, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub annotations: Option, + pub description: String, +} From 5ab96f696e7199f294bb88b955ec80c22f310d67 Mon Sep 17 00:00:00 2001 
From: janligudzinski Date: Wed, 17 Sep 2025 17:53:39 +0200 Subject: [PATCH 42/48] clarifying doc comments --- src/realtime/types.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/realtime/types.rs b/src/realtime/types.rs index 0826bc10..0819b5c9 100644 --- a/src/realtime/types.rs +++ b/src/realtime/types.rs @@ -306,7 +306,9 @@ pub struct McpApprovalFilter { #[derive(Debug, Serialize, Deserialize, Clone)] #[serde(rename_all = "lowercase")] pub enum McpApprovalMode { + /// Always require approval Always, + /// Never require approval Never, } From 4de1b32a56b2aa5e31b39e8e7164fde8e0d9c70c Mon Sep 17 00:00:00 2001 From: janligudzinski Date: Wed, 17 Sep 2025 18:10:24 +0200 Subject: [PATCH 43/48] temporary debug eprintln --- src/v1/api.rs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/v1/api.rs b/src/v1/api.rs index 1defa82d..95911bec 100644 --- a/src/v1/api.rs +++ b/src/v1/api.rs @@ -207,6 +207,13 @@ impl OpenAIClient { if response.status().is_success() { let headers = response.headers().clone(); + // added for debugging because passing MCP tool definitions seems to silently fail: + #[cfg(debug_assertions)] + { + if let Ok(text) = &response.text().await { + eprintln!("Response body: {}", text); + } + } self.response_headers = Some(headers); Ok(()) } else { From 7f1e16f237a26df961868c4bb11e94dcfe22bbf2 Mon Sep 17 00:00:00 2001 From: janligudzinski Date: Wed, 17 Sep 2025 19:21:20 +0200 Subject: [PATCH 44/48] removed useless debug statement --- src/v1/api.rs | 7 ------- 1 file changed, 7 deletions(-) diff --git a/src/v1/api.rs b/src/v1/api.rs index 95911bec..1defa82d 100644 --- a/src/v1/api.rs +++ b/src/v1/api.rs @@ -207,13 +207,6 @@ impl OpenAIClient { if response.status().is_success() { let headers = response.headers().clone(); - // added for debugging because passing MCP tool definitions seems to silently fail: - #[cfg(debug_assertions)] - { - if let Ok(text) = &response.text().await { - eprintln!("Response body: {}", text); - } - } 
self.response_headers = Some(headers); Ok(()) } else { From 543749f8ace270d50c757ef3365f2ac38b58a930 Mon Sep 17 00:00:00 2001 From: janligudzinski Date: Wed, 17 Sep 2025 19:42:03 +0200 Subject: [PATCH 45/48] supposedly this now works without passing model --- src/realtime/api/sip.rs | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/realtime/api/sip.rs b/src/realtime/api/sip.rs index f6a8607b..beb8bd5f 100644 --- a/src/realtime/api/sip.rs +++ b/src/realtime/api/sip.rs @@ -48,10 +48,11 @@ impl RealtimeSipClient { .strip_suffix("\"") .unwrap(); - let url = format!( - "{}?call_id={}&model={}", - self.wss_url, self.call_id, model_slug - ); + let url = format!("{}?call_id={}", self.wss_url, self.call_id); + // let url = format!( + // "{}?call_id={}&model={}", + // self.wss_url, self.call_id, model_slug + // ); let mut request = url.into_client_request()?; let api_key = self.api_key.clone(); request From 917a3b6542bf0db095a128bdf9b401f18faa2139 Mon Sep 17 00:00:00 2001 From: janligudzinski Date: Wed, 17 Sep 2025 19:47:19 +0200 Subject: [PATCH 46/48] remove model param from realtime endpoint URL --- src/realtime/api/sip.rs | 26 +++----------------------- 1 file changed, 3 insertions(+), 23 deletions(-) diff --git a/src/realtime/api/sip.rs b/src/realtime/api/sip.rs index beb8bd5f..066cf368 100644 --- a/src/realtime/api/sip.rs +++ b/src/realtime/api/sip.rs @@ -1,7 +1,5 @@ use serde::{Deserialize, Serialize}; -use crate::realtime::types::RealtimeModel; - use super::*; /// Intended for connecting to an already existing Realtime session spawned by accepting an incoming SIP call from e.g. Twilio. 
@@ -9,26 +7,19 @@ pub struct RealtimeSipClient { pub wss_url: String, pub api_key: String, pub call_id: String, - pub model: RealtimeModel, // contrary to the OpenAI tutorial, joining an SIP session without a `model` param causes an "invalid_request_error.missing_model" } impl RealtimeSipClient { - pub fn new(api_key: String, call_id: String, model: RealtimeModel) -> Self { + pub fn new(api_key: String, call_id: String) -> Self { let wss_url = std::env::var("WSS_URL").unwrap_or_else(|_| WSS_URL.to_owned()); - Self::new_with_endpoint(wss_url, api_key, call_id, model) + Self::new_with_endpoint(wss_url, api_key, call_id) } - pub fn new_with_endpoint( - wss_url: String, - api_key: String, - call_id: String, - model: RealtimeModel, - ) -> Self { + pub fn new_with_endpoint(wss_url: String, api_key: String, call_id: String) -> Self { Self { wss_url, api_key, call_id, - model, } } @@ -41,18 +32,7 @@ impl RealtimeSipClient { ), Box, > { - let model_slug = serde_json::to_string(&self.model).unwrap(); - let model_slug = model_slug - .strip_prefix("\"") - .unwrap() - .strip_suffix("\"") - .unwrap(); - let url = format!("{}?call_id={}", self.wss_url, self.call_id); - // let url = format!( - // "{}?call_id={}&model={}", - // self.wss_url, self.call_id, model_slug - // ); let mut request = url.into_client_request()?; let api_key = self.api_key.clone(); request From 5f30cc02efd1c383d0aa7c0fc69278000e628c8d Mon Sep 17 00:00:00 2001 From: janligudzinski Date: Wed, 17 Sep 2025 19:51:12 +0200 Subject: [PATCH 47/48] fix: remove output_index --- src/realtime/server_event.rs | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/realtime/server_event.rs b/src/realtime/server_event.rs index 91a0fdc4..1de493d2 100644 --- a/src/realtime/server_event.rs +++ b/src/realtime/server_event.rs @@ -303,21 +303,18 @@ pub struct RateLimitsUpdated { pub struct McpListToolsInProgress { pub event_id: String, pub item_id: String, - pub output_index: u32, } #[derive(Debug, Serialize, Deserialize, Clone)] 
pub struct McpListToolsCompleted { pub event_id: String, pub item_id: String, - pub output_index: u32, } #[derive(Debug, Serialize, Deserialize, Clone)] pub struct McpListToolsFailed { pub event_id: String, pub item_id: String, - pub output_index: u32, } #[derive(Debug, Serialize, Deserialize, Clone)] From 7702309dc078d76edb3624ccdd5439e0c58c5f8b Mon Sep 17 00:00:00 2001 From: janligudzinski Date: Wed, 17 Sep 2025 20:32:18 +0200 Subject: [PATCH 48/48] rename tool_call --- src/realtime/types.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/realtime/types.rs b/src/realtime/types.rs index 0819b5c9..7b386ba1 100644 --- a/src/realtime/types.rs +++ b/src/realtime/types.rs @@ -391,6 +391,7 @@ pub enum ItemType { FunctionCallOutput, McpApprovalResponse, McpListTools, + #[serde(rename = "mcp_call")] // not consistent with the docs McpToolCall, McpApprovalRequest, }