From e8df54a0590d5132f518545436b8d9e0c24e35cd Mon Sep 17 00:00:00 2001 From: Naomi Date: Thu, 31 Oct 2024 16:35:58 -0700 Subject: [PATCH 1/6] feat: add article integrating-stt-tts-deepgram --- ...ng-stt-tts-deepgram-1301691162435784711.md | 59 +++++++++++++++++++ 1 file changed, 59 insertions(+) create mode 100644 articles/en/community/integrating-stt-tts-deepgram-1301691162435784711.md diff --git a/articles/en/community/integrating-stt-tts-deepgram-1301691162435784711.md b/articles/en/community/integrating-stt-tts-deepgram-1301691162435784711.md new file mode 100644 index 0000000..062ee6d --- /dev/null +++ b/articles/en/community/integrating-stt-tts-deepgram-1301691162435784711.md @@ -0,0 +1,59 @@ +# Integrating STT and TTS with Deepgram for Low Latency + +## Integrating STT and TTS for LLM Applications + +When integrating Speech-to-Text (STT) and Text-to-Speech (TTS) capabilities using Deepgram with applications that also include large language models (LLMs), there are several architectural considerations to ensure a smooth and low-latency experience. + +### Maintaining Connections for Low Latency + +For STT functionality, maintaining an open connection is generally recommended. Deepgram’s live transcription WebSocket API is designed to efficiently handle continuous streams of data. By keeping the connection open, interim results can be returned, providing a real-time transcription experience that balances speed and accuracy. Pooling connections and reusing them as necessary can mitigate the concerns around having too many open connections or delays in establishing new ones. 
+ +In Node.js or Python, setting up a persistent WebSocket connection might look like this: + +**Node.js Example:** +```javascript +const WebSocket = require('ws'); + +const ws = new WebSocket('wss://api.deepgram.com/v1/listen', { + headers: { + Authorization: 'Token YOUR_DEEPGRAM_API_KEY' + } +}); + +ws.on('message', function incoming(data) { + console.log(JSON.parse(data)); +}); +``` + +**Python Example:** +```python +import websocket + +ws = websocket.WebSocket() +ws.connect('wss://api.deepgram.com/v1/listen', header=["Authorization: Token YOUR_DEEPGRAM_API_KEY"]) + +def on_message(ws, message): + print(message) + +ws.run_forever(on_message=on_message) +``` + +### Transition Between TTS and STT + +An important aspect of the architecture is handling the transition between TTS output and restarting STT input. This can be achieved by implementing state management that detects when the TTS playback concludes. Using an audio API, TTS playback can be monitored, and upon completion, a state change can trigger the resumption of STT. + +To improve the flow, a local voice activity detection (VAD) system can be used, allowing the user to interrupt the TTS playback. By detecting speech, the system can pause or cancel the ongoing TTS and process the new input immediately. + +### Overall Architecture Considerations + +For optimal performance, the architecture typically sees the client establish a WebSocket connection directly with Deepgram’s API, managing the sessions for both STT and TTS. However, anyone considering this approach must weigh the security risks against the latency benefits. Keeping API keys client-side might introduce vulnerabilities unless mitigated by the use of short-lived tokens or other security measures. + +When deploying the backend, consider the complexity of managing the API key lifecycle and access control to ensure security while maintaining a seamless experience. 
+ +## Conclusion + +Building a responsive system with STT and TTS requires careful consideration of connection management and state transitions between speech recognition and generation processes. Using Deepgram's real-time WebSocket APIs, you can maintain open connections for better responsiveness and integrate VAD to smooth out the interaction flow between your voice interface components. + +### References +- [Deepgram Live Transcription Docs](https://developers.deepgram.com/docs/getting-started-with-live-streaming-audio) +- [Deepgram Text-to-Speech Docs](https://developers.deepgram.com/docs/tts-websocket) \ No newline at end of file From 9dedc31de8ed29727e86465ce76c111b693e787e Mon Sep 17 00:00:00 2001 From: Naomi Date: Thu, 31 Oct 2024 16:36:28 -0700 Subject: [PATCH 2/6] feat: add code samples integrating-stt-tts-deepgram --- .../integrating-stt-tts-deepgram.rs | 46 +++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 code/community/1301691162435784711/integrating-stt-tts-deepgram.rs diff --git a/code/community/1301691162435784711/integrating-stt-tts-deepgram.rs b/code/community/1301691162435784711/integrating-stt-tts-deepgram.rs new file mode 100644 index 0000000..c3e457d --- /dev/null +++ b/code/community/1301691162435784711/integrating-stt-tts-deepgram.rs @@ -0,0 +1,46 @@ +use std::env; +use std::error::Error; +use websocket::{ClientBuilder, Message}; +use std::thread; +use std::io::Write; + +fn main() -> Result<(), Box> { + let stt_url = "wss://api.deepgram.com/v1/listen"; + let tts_url = "wss://api.deepgram.com/v1/speak"; + let api_key = env::var("DEEPGRAM_API_KEY")?; + + // Connect to the STT WebSocket + let stt_client = ClientBuilder::new(stt_url)? 
+ .add_protocol("rust-websocket") + .connect_secure(Some(websocket::sync::NativeTlsSettings::default()))?; + + stt_client.send_message(&Message::text("{\"content-type\": \"audio/wav\", \"interim_results\": true}"))?; + + // Keep the connection open while processing + let (mut receiver, mut sender) = stt_client.split()?; + + thread::spawn(move || { + for message in receiver.incoming_messages() { + match message { + Ok(message) => match message { + Message::Text(txt) => println!("Received: {}", txt), + _ => {} + }, + Err(e) => eprintln!("Error: {:#?}", e), + } + } + }); + + // Example TTS management (assuming TTS state is managed similarly) + let tts_client = ClientBuilder::new(tts_url)? + .add_protocol("rust-websocket") + .connect_secure(Some(websocket::sync::NativeTlsSettings::default()))?; + + // For TTS, send a message to play text + let tts_text = "Hello, this is a test of Text-to-Speech API."; + tts_client.send_message(&Message::text(tts_text))?; + + std::io::stdout().flush()?; + + Ok(()) +} \ No newline at end of file From 87d544af7fbe59b80fcb3796db20142ca313da8a Mon Sep 17 00:00:00 2001 From: Naomi Date: Thu, 31 Oct 2024 16:36:29 -0700 Subject: [PATCH 3/6] feat: add code samples integrating-stt-tts-deepgram --- .../integrating-stt-tts-deepgram.py | 34 +++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 code/community/1301691162435784711/integrating-stt-tts-deepgram.py diff --git a/code/community/1301691162435784711/integrating-stt-tts-deepgram.py b/code/community/1301691162435784711/integrating-stt-tts-deepgram.py new file mode 100644 index 0000000..ba5de17 --- /dev/null +++ b/code/community/1301691162435784711/integrating-stt-tts-deepgram.py @@ -0,0 +1,34 @@ +import os +import asyncio +import websockets + +async def connect_stt(): + url = "wss://api.deepgram.com/v1/listen" + api_key = os.environ['DEEPGRAM_API_KEY'] + headers = {"Authorization": f"Token {api_key}"} + + async with websockets.connect(url, extra_headers=headers) as websocket: 
+ await websocket.send("{\"content-type\": \"audio/wav\", \"interim_results\": true}") + while True: + response = await websocket.recv() + print(f"STT Received: {response}") + +async def connect_tts(): + url = "wss://api.deepgram.com/v1/speak" + api_key = os.environ['DEEPGRAM_API_KEY'] + headers = {"Authorization": f"Token {api_key}"} + + async with websockets.connect(url, extra_headers=headers) as websocket: + tts_text = "Hello, this is a test message for TTS." + await websocket.send(tts_text) + response = await websocket.recv() + print(f"TTS Received: {response}") + +async def main(): + await asyncio.gather( + connect_stt(), + connect_tts() + ) + +if __name__ == '__main__': + asyncio.run(main()) \ No newline at end of file From b412808240b470c24a33945add7ad161725df7e9 Mon Sep 17 00:00:00 2001 From: Naomi Date: Thu, 31 Oct 2024 16:36:30 -0700 Subject: [PATCH 4/6] feat: add code samples integrating-stt-tts-deepgram --- .../integrating-stt-tts-deepgram.js | 33 +++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 code/community/1301691162435784711/integrating-stt-tts-deepgram.js diff --git a/code/community/1301691162435784711/integrating-stt-tts-deepgram.js b/code/community/1301691162435784711/integrating-stt-tts-deepgram.js new file mode 100644 index 0000000..4878242 --- /dev/null +++ b/code/community/1301691162435784711/integrating-stt-tts-deepgram.js @@ -0,0 +1,33 @@ +const WebSocket = require('ws'); +require('dotenv').config(); + +const DEEPGRAM_API_KEY = process.env.DEEPGRAM_API_KEY; + +// Connect to Deepgram STT WebSocket +const sttSocket = new WebSocket('wss://api.deepgram.com/v1/listen', { + headers: { Authorization: `Token ${DEEPGRAM_API_KEY}` } +}); + +sttSocket.on('open', () => { + console.log('STT WebSocket Connection Opened'); + sttSocket.send(JSON.stringify({ "content-type": "audio/wav", "interim_results": true })); +}); + +sttSocket.on('message', (data) => { + console.log('STT Received:', data); +}); + +// Connect to Deepgram TTS 
WebSocket +const ttsSocket = new WebSocket('wss://api.deepgram.com/v1/speak', { + headers: { Authorization: `Token ${DEEPGRAM_API_KEY}` } +}); + +ttsSocket.on('open', () => { + console.log('TTS WebSocket Connection Opened'); + const ttsText = "Hello, this is a test using Deepgram's TTS."; + ttsSocket.send(ttsText); +}); + +ttsSocket.on('message', (data) => { + console.log('TTS Received Audio:', data); +}); \ No newline at end of file From 3a73e426ef110262be7f9a7a60d9c15b95c6fe5a Mon Sep 17 00:00:00 2001 From: Naomi Date: Thu, 31 Oct 2024 16:36:31 -0700 Subject: [PATCH 5/6] feat: add code samples integrating-stt-tts-deepgram --- .../integrating-stt-tts-deepgram.cs | 61 +++++++++++++++++++ 1 file changed, 61 insertions(+) create mode 100644 code/community/1301691162435784711/integrating-stt-tts-deepgram.cs diff --git a/code/community/1301691162435784711/integrating-stt-tts-deepgram.cs b/code/community/1301691162435784711/integrating-stt-tts-deepgram.cs new file mode 100644 index 0000000..09ea0fa --- /dev/null +++ b/code/community/1301691162435784711/integrating-stt-tts-deepgram.cs @@ -0,0 +1,61 @@ +using System; +using System.Net.WebSockets; +using System.Threading; +using System.Threading.Tasks; +using System.Text; +using System.Collections.Specialized; +using System.Web; + +class Program +{ + private static async Task ConnectSttAsync() + { + using (ClientWebSocket socket = new ClientWebSocket()) + { + var uri = new Uri("wss://api.deepgram.com/v1/listen"); + socket.Options.SetRequestHeader("Authorization", "Token " + Environment.GetEnvironmentVariable("DEEPGRAM_API_KEY")); + + await socket.ConnectAsync(uri, CancellationToken.None); + + Console.WriteLine("WebSocket STT Connection Established"); + + var message = Encoding.UTF8.GetBytes("{\"content-type\": \"audio/wav\", \"interim_results\": true}"); + await socket.SendAsync(new ArraySegment(message), WebSocketMessageType.Text, true, CancellationToken.None); + + var buffer = new byte[1024]; + while (socket.State == 
WebSocketState.Open) + { + var result = await socket.ReceiveAsync(new ArraySegment(buffer), CancellationToken.None); + Console.WriteLine("STT Received: " + Encoding.UTF8.GetString(buffer, 0, result.Count)); + } + } + } + + private static async Task ConnectTtsAsync() + { + using (ClientWebSocket socket = new ClientWebSocket()) + { + var uri = new Uri("wss://api.deepgram.com/v1/speak"); + socket.Options.SetRequestHeader("Authorization", "Token " + Environment.GetEnvironmentVariable("DEEPGRAM_API_KEY")); + + await socket.ConnectAsync(uri, CancellationToken.None); + + Console.WriteLine("WebSocket TTS Connection Established"); + + var message = Encoding.UTF8.GetBytes("Hello, this is a Deepgram TTS test."); + await socket.SendAsync(new ArraySegment(message), WebSocketMessageType.Text, true, CancellationToken.None); + + var buffer = new byte[1024]; + while (socket.State == WebSocketState.Open) + { + var result = await socket.ReceiveAsync(new ArraySegment(buffer), CancellationToken.None); + Console.WriteLine("TTS Received: " + Encoding.UTF8.GetString(buffer, 0, result.Count)); + } + } + } + + public static async Task Main(string[] args) + { + await Task.WhenAll(ConnectSttAsync(), ConnectTtsAsync()); + } +} \ No newline at end of file From c8e19c3faeb3353eca657b3bdfb151e509ad03ca Mon Sep 17 00:00:00 2001 From: Naomi Date: Thu, 31 Oct 2024 16:36:32 -0700 Subject: [PATCH 6/6] feat: add code samples integrating-stt-tts-deepgram --- .../integrating-stt-tts-deepgram.go | 54 +++++++++++++++++++ 1 file changed, 54 insertions(+) create mode 100644 code/community/1301691162435784711/integrating-stt-tts-deepgram.go diff --git a/code/community/1301691162435784711/integrating-stt-tts-deepgram.go b/code/community/1301691162435784711/integrating-stt-tts-deepgram.go new file mode 100644 index 0000000..7c08841 --- /dev/null +++ b/code/community/1301691162435784711/integrating-stt-tts-deepgram.go @@ -0,0 +1,54 @@ +package main + +import ( + "os" + "fmt" + "log" + 
"golang.org/x/net/websocket" +) + +func main() { + sttUrl := "wss://api.deepgram.com/v1/listen" + ttsUrl := "wss://api.deepgram.com/v1/speak" + apiKey := os.Getenv("DEEPGRAM_API_KEY") + + headers := websocket.Config{ + Location: websocket.ParseLocation(sttUrl), + Origin: websocket.ParseLocation("https://api.deepgram.com"), + Header: websocket.Header{ + "Authorization": {"Token " + apiKey}, + }, + } + + // STT connection + sttConn, err := websocket.DialConfig(&headers) + if err != nil { + log.Fatal(err) + } + defer sttConn.Close() + + fmt.Println("STT WebSocket Connection Opened") + + sttMessage := "{\"content-type\": \"audio/wav\", \"interim_results\": true}" + websocket.Message.Send(sttConn, sttMessage) + + var sttReply string + websocket.Message.Receive(sttConn, &sttReply) + fmt.Println("STT Received:", sttReply) + + // TTS connection + ttsConn, err := websocket.DialConfig(&headers) + if err != nil { + log.Fatal(err) + } + defer ttsConn.Close() + + fmt.Println("TTS WebSocket Connection Opened") + + ttsMessage := "Hello, this is a text-to-speech demo." + websocket.Message.Send(ttsConn, ttsMessage) + + var ttsReply string + websocket.Message.Receive(ttsConn, &ttsReply) + fmt.Println("TTS Received:", ttsReply) +} \ No newline at end of file