diff --git a/lib/hooks/use-browser-tts.ts b/lib/hooks/use-browser-tts.ts
index 119575fd..159d0581 100644
--- a/lib/hooks/use-browser-tts.ts
+++ b/lib/hooks/use-browser-tts.ts
@@ -34,6 +34,22 @@ export function useBrowserTTS(options: UseBrowserTTSOptions = {}) {
   const [availableVoices, setAvailableVoices] = useState([]);
   const utteranceRef = useRef(null);
 
+  /**
+   * Cancel+re-speak state for instant pause & resume-from-position.
+   *
+   * Approach (same pattern as PlaybackEngine):
+   * - pause(): call cancel() for instant silence; save text + last word boundary
+   * - resume(): re-speak text.slice(lastBoundaryIndex) with the same voice
+   *
+   * This avoids the ~300ms delay of speechSynthesis.pause() and enables
+   * resuming from approximate pause position rather than sentence start.
+   */
+  const pausedTextRef = useRef(null);
+  const pausedVoiceURIRef = useRef(undefined);
+  const lastBoundaryIndexRef = useRef(0);
+  /** Flag to suppress onend/onerror events caused by cancel-for-pause (they may fire asynchronously) */
+  const cancellingForPauseRef = useRef(false);
+
   // Load available voices
   useEffect(() => {
     if (typeof window === 'undefined' || !window.speechSynthesis) {
@@ -59,13 +75,23 @@ export function useBrowserTTS(options: UseBrowserTTSOptions = {}) {
     };
   }, []);
 
-  const speak = useCallback(
-    (text: string, voiceURI?: string) => {
+  /**
+   * Internal speak helper — shared by public speak() and resume().
+   * @param isResume When true, suppresses the onStart callback to avoid
+   *   duplicate side effects when resuming from pause.
+   */
+  const speakInternal = useCallback(
+    (text: string, voiceURI?: string, isResume?: boolean) => {
       if (typeof window === 'undefined' || !window.speechSynthesis) {
         onError?.('浏览器不支持 Web Speech API');
         return;
       }
 
+      // Reset pause-cancel flag — new speech (from speak() or resume()) ends
+      // the paused period. Late onend/onerror events from the cancelled
+      // utterance are filtered by the identity check in the handlers below.
+      cancellingForPauseRef.current = false;
+
       // Cancel any ongoing speech
       window.speechSynthesis.cancel();
 
@@ -83,54 +109,101 @@ export function useBrowserTTS(options: UseBrowserTTSOptions = {}) {
         }
       }
 
+      // Track word boundaries for resume-from-position.
+      // Save charIndex + charLength (= end of word) so resume skips the
+      // word that was already spoken, rather than repeating it.
+      // Boundaries reported by a superseded utterance are ignored.
+      utterance.onboundary = (event) => {
+        if (utteranceRef.current === utterance && event.name === 'word') {
+          lastBoundaryIndexRef.current = event.charIndex + (event.charLength ?? 0);
+        }
+      };
+
       utterance.onstart = () => {
         setIsSpeaking(true);
         setIsPaused(false);
-        onStart?.();
+        if (!isResume) onStart?.();
       };
 
       utterance.onend = () => {
+        // Ignore events from an utterance that was cancelled for pause or has
+        // been replaced by a newer one; they must not clobber current state.
+        if (cancellingForPauseRef.current || utteranceRef.current !== utterance) return;
         setIsSpeaking(false);
         setIsPaused(false);
         utteranceRef.current = null;
+        pausedTextRef.current = null;
         onEnd?.();
       };
 
       utterance.onerror = (event) => {
+        // Same guard as onend: cancel() reports 'canceled'/'interrupted' here.
+        if (cancellingForPauseRef.current || utteranceRef.current !== utterance) return;
         setIsSpeaking(false);
         setIsPaused(false);
         utteranceRef.current = null;
+        pausedTextRef.current = null;
         onError?.(event.error);
       };
 
-      utterance.onpause = () => {
-        setIsPaused(true);
-      };
-
-      utterance.onresume = () => {
-        setIsPaused(false);
-      };
-
       utteranceRef.current = utterance;
+      // Save full text + voice for potential pause+re-speak
+      pausedTextRef.current = text;
+      pausedVoiceURIRef.current = voiceURI;
+      lastBoundaryIndexRef.current = 0;
       window.speechSynthesis.speak(utterance);
     },
     [rate, pitch, volume, lang, availableVoices, onStart, onEnd, onError],
   );
 
+  const speak = useCallback(
+    (text: string, voiceURI?: string) => {
+      speakInternal(text, voiceURI, false);
+    },
+    [speakInternal],
+  );
+
   const pause = useCallback(() => {
     if (typeof window !== 'undefined' && window.speechSynthesis) {
-      window.speechSynthesis.pause();
+      // Nothing is being spoken, so there is nothing to pause or resume later.
+      if (pausedTextRef.current === null) return;
+      // Cancel+re-speak pattern: cancel() is instant — no ~300ms delay
+      // like speechSynthesis.pause(). Text + boundary position are already
+      // saved by speakInternal, so resume() can re-speak from there.
+      cancellingForPauseRef.current = true;
+      window.speechSynthesis.cancel();
+      // Keep cancellingForPauseRef = true for the entire pause period.
+      // Chrome fires onend/onerror asynchronously after cancel(), so we
+      // must NOT reset the flag here. It is reset in speakInternal() when
+      // new speech starts (from speak() or resume()).
+      setIsPaused(true);
     }
   }, []);
 
   const resume = useCallback(() => {
-    if (typeof window !== 'undefined' && window.speechSynthesis) {
-      window.speechSynthesis.resume();
+    if (typeof window !== 'undefined' && window.speechSynthesis && pausedTextRef.current) {
+      const fullText = pausedTextRef.current;
+      const voiceURI = pausedVoiceURIRef.current;
+      // Slice from last word boundary for resume-from-position
+      const remaining = fullText.slice(lastBoundaryIndexRef.current);
+      pausedTextRef.current = null;
+      setIsPaused(false);
+      if (remaining.trim()) {
+        speakInternal(remaining, voiceURI, true);
+      } else {
+        // Nothing left to speak — treat as natural end
+        setIsSpeaking(false);
+        utteranceRef.current = null;
+        onEnd?.();
+      }
     }
-  }, []);
+  }, [speakInternal, onEnd]);
 
   const cancel = useCallback(() => {
     if (typeof window !== 'undefined' && window.speechSynthesis) {
+      pausedTextRef.current = null;
+      cancellingForPauseRef.current = false;
+      lastBoundaryIndexRef.current = 0;
       window.speechSynthesis.cancel();
       setIsSpeaking(false);
       setIsPaused(false);
diff --git a/lib/hooks/use-discussion-tts.ts b/lib/hooks/use-discussion-tts.ts
index 074a60d9..e512a2fd 100644
--- a/lib/hooks/use-discussion-tts.ts
+++ b/lib/hooks/use-discussion-tts.ts
@@ -36,6 +36,8 @@ export function useDiscussionTTS({ enabled, agents, onAudioStateChange }: Discus
   const queueRef = useRef([]);
   const isPlayingRef = useRef(false);
   const pausedRef = useRef(false);
+  /** Tracks which TTS provider is currently speaking (for pause/resume delegation) */
+  const currentProviderRef = useRef(null);
   const segmentDoneCounterRef = useRef(0);
   const abortControllerRef = useRef(null);
   const audioRef = useRef(null);
@@ -43,19 +45,31 @@ export function useDiscussionTTS({ enabled, agents, onAudioStateChange }: Discus
   onAudioStateChangeRef.current = onAudioStateChange;
   const processQueueRef = useRef<() => void>(() => {});
 
-  const { speak: browserSpeak, cancel: browserCancel } = useBrowserTTS({
+  const {
+    speak: browserSpeak,
+    pause: browserPause,
+    resume: browserResume,
+    cancel: browserCancel,
+  } = useBrowserTTS({
     rate: ttsSpeed,
     onEnd: () => {
       isPlayingRef.current = false;
       segmentDoneCounterRef.current++;
       onAudioStateChangeRef.current?.(null, 'idle');
-      processQueueRef.current();
+      // Don't advance queue while paused — resume() will kick-start it
+      if (!pausedRef.current) {
+        processQueueRef.current();
+      }
     },
   });
   const browserCancelRef = useRef(browserCancel);
   browserCancelRef.current = browserCancel;
   const browserSpeakRef = useRef(browserSpeak);
   browserSpeakRef.current = browserSpeak;
+  const browserPauseRef = useRef(browserPause);
+  browserPauseRef.current = browserPause;
+  const browserResumeRef = useRef(browserResume);
+  browserResumeRef.current = browserResume;
 
   // Build agent index map for deterministic voice resolution
   const agentIndexMap = useRef>(new Map());
@@ -98,6 +112,7 @@ export function useDiscussionTTS({ enabled, agents, onAudioStateChange }: Discus
   );
 
   const processQueue = useCallback(async () => {
+    if (pausedRef.current) return; // Don't advance while paused
     if (isPlayingRef.current || queueRef.current.length === 0) return;
     if (!enabled || ttsMuted) {
       queueRef.current = [];
@@ -109,12 +124,14 @@ export function useDiscussionTTS({ enabled, agents, onAudioStateChange }: Discus
 
     // Browser TTS
     if (item.providerId === 'browser-native-tts') {
+      currentProviderRef.current = item.providerId;
       onAudioStateChangeRef.current?.(item.agentId, 'playing');
       browserSpeakRef.current(item.text, item.voiceId);
       return;
     }
 
     // Server TTS — use the item's provider, not the global one
+    currentProviderRef.current = item.providerId;
     onAudioStateChangeRef.current?.(item.agentId, 'generating');
     const controller = new AbortController();
     abortControllerRef.current = controller;
@@ -198,6 +215,8 @@ export function useDiscussionTTS({ enabled, agents, onAudioStateChange }: Discus
   );
 
   const cleanup = useCallback(() => {
+    pausedRef.current = false;
+    currentProviderRef.current = null;
     abortControllerRef.current?.abort();
     abortControllerRef.current = null;
     if (audioRef.current) {
@@ -208,22 +227,35 @@ export function useDiscussionTTS({ enabled, agents, onAudioStateChange }: Discus
     browserCancelRef.current();
     queueRef.current = [];
     isPlayingRef.current = false;
-    pausedRef.current = false;
     segmentDoneCounterRef.current = 0;
     onAudioStateChangeRef.current?.(null, 'idle');
   }, []);
 
+  /** Pause TTS audio (browser-native or server). Does NOT stop the SSE stream. */
   const pause = useCallback(() => {
+    if (pausedRef.current) return;
     pausedRef.current = true;
-    if (audioRef.current && !audioRef.current.paused) {
+    if (currentProviderRef.current === 'browser-native-tts') {
+      browserPauseRef.current();
+    } else if (audioRef.current && !audioRef.current.paused) {
       audioRef.current.pause();
     }
   }, []);
 
+  /** Resume TTS audio. If the previous segment already ended while paused, advance the queue. */
   const resume = useCallback(() => {
+    if (!pausedRef.current) return;
     pausedRef.current = false;
-    if (audioRef.current && audioRef.current.paused && audioRef.current.src) {
+    if (!isPlayingRef.current) {
+      // Segment finished (or none was playing) while paused — kick-start the
+      // queue. Checked first so it is reachable for every provider.
+      processQueueRef.current();
+    } else if (currentProviderRef.current === 'browser-native-tts') {
+      browserResumeRef.current();
+    } else if (audioRef.current && audioRef.current.paused && audioRef.current.src) {
+      // play() returns a promise that can reject (autoplay policy, aborted
+      // load); swallow it as before to avoid an unhandled rejection.
       audioRef.current.play().catch(() => {});
     }
   }, []);
 
diff --git a/lib/playback/engine.ts b/lib/playback/engine.ts
index e13e60b1..a5ed0c82 100644
--- a/lib/playback/engine.ts
+++ b/lib/playback/engine.ts
@@ -81,6 +81,8 @@ export class PlaybackEngine {
   private browserTTSChunks: string[] = []; // sentence-level chunks for sequential playback
   private browserTTSChunkIndex: number = 0; // current chunk being spoken
   private browserTTSPausedChunks: string[] = []; // remaining chunks saved on pause (for cancel+re-speak)
+  private browserTTSBoundaryIndex: number = 0; // charIndex within current chunk from onboundary (for resume-from-position)
+  private browserTTSCurrentLang: string = ''; // resolved lang of current chunk (preserved across pause+resume)
   private speechTimerRemaining: number = 0; // remaining ms (set on pause)
 
   constructor(
@@ -167,7 +169,14 @@ export class PlaybackEngine {
         // Cancel+re-speak pattern: save remaining chunks for resume.
         // speechSynthesis.pause()/resume() is broken on Firefox, so we
         // cancel now and re-speak from current chunk onward on resume.
-        this.browserTTSPausedChunks = this.browserTTSChunks.slice(this.browserTTSChunkIndex);
+        // Slice the current chunk from the last word boundary so resume
+        // continues from approximate pause position, not chunk start (#250).
+        const currentChunk = this.browserTTSChunks[this.browserTTSChunkIndex];
+        const remainingText = currentChunk?.slice(this.browserTTSBoundaryIndex) ?? '';
+        const futureChunks = this.browserTTSChunks.slice(this.browserTTSChunkIndex + 1);
+        this.browserTTSPausedChunks = remainingText.trim()
+          ? [remainingText, ...futureChunks]
+          : futureChunks;
         window.speechSynthesis?.cancel();
         // Note: cancel fires onerror('canceled'), which we ignore (see playBrowserTTSChunk)
       } else if (this.audioPlayer.isPlaying()) {
@@ -617,6 +626,8 @@ export class PlaybackEngine {
   private playBrowserTTS(speechAction: SpeechAction): void {
     this.browserTTSChunks = this.splitIntoChunks(speechAction.text);
     this.browserTTSChunkIndex = 0;
+    this.browserTTSBoundaryIndex = 0;
+    this.browserTTSCurrentLang = '';
     this.browserTTSPausedChunks = [];
     this.browserTTSActive = true;
     this.playBrowserTTSChunk();
@@ -635,8 +646,24 @@ export class PlaybackEngine {
 
     const settings = useSettingsStore.getState();
     const chunkText = this.browserTTSChunks[this.browserTTSChunkIndex];
+    this.browserTTSBoundaryIndex = 0; // reset for new chunk
     const utterance = new SpeechSynthesisUtterance(chunkText);
 
+    // If we have a saved lang from a previous chunk (e.g., from pause+resume),
+    // use it to prevent voice switching at language boundaries.
+    if (this.browserTTSCurrentLang) {
+      utterance.lang = this.browserTTSCurrentLang;
+    }
+
+    // Track word boundaries for resume-from-position (#250).
+    // Save charIndex + charLength (= end of word) so resume skips the
+    // word that was already spoken, rather than repeating it.
+    utterance.onboundary = (event) => {
+      if (event.name === 'word') {
+        this.browserTTSBoundaryIndex = event.charIndex + (event.charLength ?? 0);
+      }
+    };
+
     // Apply settings
     const speed = this.callbacks.getPlaybackSpeed?.() ?? 1;
     utterance.rate = (settings.ttsSpeed ?? 1) * speed;
@@ -662,6 +689,9 @@ export class PlaybackEngine {
         (chunkText.match(/[\u4e00-\u9fff\u3400-\u4dbf]/g) || []).length / chunkText.length;
       utterance.lang = cjkRatio > CJK_LANG_THRESHOLD ? 'zh-CN' : 'en-US';
     }
+    // Save resolved lang so sliced resume chunks use the same language
+    // (prevents voice switching when pause point is at a language boundary)
+    this.browserTTSCurrentLang = utterance.lang;
 
     utterance.onend = () => {
       this.browserTTSChunkIndex++;
@@ -730,6 +760,8 @@ export class PlaybackEngine {
     this.browserTTSActive = false;
     this.browserTTSChunks = [];
     this.browserTTSChunkIndex = 0;
+    this.browserTTSBoundaryIndex = 0;
+    this.browserTTSCurrentLang = '';
     this.browserTTSPausedChunks = [];
     window.speechSynthesis?.cancel();
   }