Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
97 changes: 82 additions & 15 deletions lib/hooks/use-browser-tts.ts
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,22 @@ export function useBrowserTTS(options: UseBrowserTTSOptions = {}) {
const [availableVoices, setAvailableVoices] = useState<SpeechSynthesisVoice[]>([]);
const utteranceRef = useRef<SpeechSynthesisUtterance | null>(null);

/**
 * Cancel+re-speak state for instant pause & resume-from-position.
 *
 * Approach (same pattern as PlaybackEngine):
 * - speakInternal(): records the spoken text and voice, and tracks the end of
 *   the last word reported by the utterance's boundary events
 * - pause(): calls cancel() for instant silence
 * - resume(): re-speaks text.slice(lastBoundaryIndex) with the same voice
 *
 * This avoids the ~300ms delay of speechSynthesis.pause() and enables
 * resuming from approximate pause position rather than sentence start.
 */
// Text of the utterance being spoken or paused; null once it ends, errors,
// is cancelled, or has been handed back to speakInternal by resume().
const pausedTextRef = useRef<string | null>(null);
// Voice URI that was passed for the current text, reused on resume.
const pausedVoiceURIRef = useRef<string | undefined>(undefined);
// Offset just past the last word reported by onboundary; reset to 0 for
// every new utterance and on cancel().
const lastBoundaryIndexRef = useRef(0);
/**
 * Flag to suppress the onend/onerror handlers triggered by cancel-for-pause.
 * Set by pause() and kept for the whole pause period (the events may arrive
 * asynchronously); reset when new speech starts or cancel() is called.
 */
const cancellingForPauseRef = useRef(false);

// Load available voices
useEffect(() => {
if (typeof window === 'undefined' || !window.speechSynthesis) {
Expand All @@ -59,13 +75,23 @@ export function useBrowserTTS(options: UseBrowserTTSOptions = {}) {
};
}, []);

const speak = useCallback(
(text: string, voiceURI?: string) => {
/**
* Internal speak helper — shared by public speak() and resume().
* @param isResume When true, suppresses the onStart callback to avoid
* duplicate side effects when resuming from pause.
*/
const speakInternal = useCallback(
(text: string, voiceURI?: string, isResume?: boolean) => {
if (typeof window === 'undefined' || !window.speechSynthesis) {
onError?.('浏览器不支持 Web Speech API');
return;
}

// Reset pause-cancel flag — new speech (from speak() or resume()) means
// any pending async onEnd/onError from a previous cancel-for-pause should
// no longer be suppressed (they've either already fired or are stale).
cancellingForPauseRef.current = false;

// Cancel any ongoing speech
window.speechSynthesis.cancel();

Expand All @@ -83,54 +109,95 @@ export function useBrowserTTS(options: UseBrowserTTSOptions = {}) {
}
}

// Track word boundaries for resume-from-position.
// Save charIndex + charLength (= end of word) so resume skips the
// word that was already spoken, rather than repeating it.
utterance.onboundary = (event) => {
if (event.name === 'word') {
lastBoundaryIndexRef.current = event.charIndex + (event.charLength ?? 0);
}
};

utterance.onstart = () => {
  setIsSpeaking(true);
  setIsPaused(false);
  // Resumed speech continues the same logical utterance, so onStart fires
  // only for a fresh speak(), never for a resume.
  if (!isResume) onStart?.();
};

// True when an end/error event must be ignored: either pause() cancelled
// this utterance on purpose, or a newer utterance has already replaced it.
// The second check matters because browsers may deliver the 'canceled'
// event asynchronously, i.e. after the pause flag was reset for the next
// utterance; without it the late event would wipe the new utterance's state
// and report a premature end to the caller.
const isStaleUtteranceEvent = (): boolean =>
  cancellingForPauseRef.current ||
  (utteranceRef.current !== null && utteranceRef.current !== utterance);

utterance.onend = () => {
  if (isStaleUtteranceEvent()) return; // pause handler / newer utterance owns state
  setIsSpeaking(false);
  setIsPaused(false);
  utteranceRef.current = null;
  pausedTextRef.current = null;
  onEnd?.();
};

utterance.onerror = (event) => {
  if (isStaleUtteranceEvent()) return; // e.g. 'canceled' from cancel-for-pause
  setIsSpeaking(false);
  setIsPaused(false);
  utteranceRef.current = null;
  pausedTextRef.current = null;
  onError?.(event.error);
};

utterance.onpause = () => {
setIsPaused(true);
};

utterance.onresume = () => {
setIsPaused(false);
};

utteranceRef.current = utterance;
// Save full text + voice for potential pause+re-speak
pausedTextRef.current = text;
pausedVoiceURIRef.current = voiceURI;
lastBoundaryIndexRef.current = 0;
window.speechSynthesis.speak(utterance);
},
[rate, pitch, volume, lang, availableVoices, onStart, onEnd, onError],
);

/**
 * Public entry point: speak `text` from the beginning with the optional voice.
 * Delegates to speakInternal with isResume = false, so onStart is reported.
 */
const speak = useCallback(
  function speak(text: string, voiceURI?: string): void {
    speakInternal(text, voiceURI, false);
  },
  [speakInternal],
);

/**
 * Pause speech instantly using the cancel+re-speak pattern.
 *
 * cancel() silences output immediately. speechSynthesis.pause() is
 * deliberately NOT called: it is slower, and it would leave the synthesizer
 * in its paused state, where the utterance re-spoken by resume() would not
 * start. The text and last word boundary are already tracked by
 * speakInternal, so resume() can continue from there.
 */
const pause = useCallback(() => {
  if (typeof window === 'undefined' || !window.speechSynthesis) return;

  // Raise the flag BEFORE cancelling so the onend/onerror handlers triggered
  // by the cancel are suppressed. It stays set for the whole pause period
  // because those events may arrive asynchronously; speakInternal() resets it
  // when new speech starts (from speak() or resume()).
  cancellingForPauseRef.current = true;
  window.speechSynthesis.cancel();
  setIsPaused(true);
}, []);

/**
 * Resume after pause() by re-speaking the part of the saved text that follows
 * the last reported word boundary, using the same voice.
 *
 * Does nothing when speech synthesis is unavailable or there is no saved text.
 * If only whitespace remains, the utterance is treated as finished and onEnd
 * is reported instead of speaking.
 */
const resume = useCallback(() => {
  if (typeof window === 'undefined' || !window.speechSynthesis) return;

  const fullText = pausedTextRef.current;
  if (!fullText) return;

  const voiceURI = pausedVoiceURIRef.current;
  // Slice from last word boundary for resume-from-position
  const remaining = fullText.slice(lastBoundaryIndexRef.current);
  pausedTextRef.current = null;
  setIsPaused(false);

  if (remaining.trim()) {
    // isResume = true: do not report onStart a second time
    speakInternal(remaining, voiceURI, true);
  } else {
    // Nothing left to speak — treat as natural end
    setIsSpeaking(false);
    utteranceRef.current = null;
    onEnd?.();
  }
}, [speakInternal, onEnd]);

const cancel = useCallback(() => {
if (typeof window !== 'undefined' && window.speechSynthesis) {
pausedTextRef.current = null;
cancellingForPauseRef.current = false;
lastBoundaryIndexRef.current = 0;
window.speechSynthesis.cancel();
setIsSpeaking(false);
setIsPaused(false);
Expand Down
41 changes: 35 additions & 6 deletions lib/hooks/use-discussion-tts.ts
Original file line number Diff line number Diff line change
Expand Up @@ -36,26 +36,40 @@ export function useDiscussionTTS({ enabled, agents, onAudioStateChange }: Discus
const queueRef = useRef<QueueItem[]>([]);
const isPlayingRef = useRef(false);
const pausedRef = useRef(false);
/** Tracks which TTS provider is currently speaking (for pause/resume delegation) */
const currentProviderRef = useRef<TTSProviderId | null>(null);
const segmentDoneCounterRef = useRef(0);
const abortControllerRef = useRef<AbortController | null>(null);
const audioRef = useRef<HTMLAudioElement | null>(null);
const onAudioStateChangeRef = useRef(onAudioStateChange);
onAudioStateChangeRef.current = onAudioStateChange;
const processQueueRef = useRef<() => void>(() => {});

// Browser-native TTS controls. onEnd marks the current segment as finished
// and, unless playback is paused, moves on to the next queued item.
const {
  speak: browserSpeak,
  pause: browserPause,
  resume: browserResume,
  cancel: browserCancel,
} = useBrowserTTS({
  rate: ttsSpeed,
  onEnd: () => {
    isPlayingRef.current = false;
    segmentDoneCounterRef.current++;
    onAudioStateChangeRef.current?.(null, 'idle');
    // Don't advance queue while paused — resume() will kick-start it
    if (!pausedRef.current) {
      processQueueRef.current();
    }
  },
});
// Mirror the browser-TTS callbacks into refs, refreshed on every render, so
// that callbacks created with an empty dependency list (pause, resume,
// cleanup) always invoke the latest function via `.current`.
const browserCancelRef = useRef(browserCancel);
browserCancelRef.current = browserCancel;
const browserSpeakRef = useRef(browserSpeak);
browserSpeakRef.current = browserSpeak;
const browserPauseRef = useRef(browserPause);
browserPauseRef.current = browserPause;
const browserResumeRef = useRef(browserResume);
browserResumeRef.current = browserResume;

// Build agent index map for deterministic voice resolution
const agentIndexMap = useRef<Map<string, number>>(new Map());
Expand Down Expand Up @@ -98,6 +112,7 @@ export function useDiscussionTTS({ enabled, agents, onAudioStateChange }: Discus
);

const processQueue = useCallback(async () => {
if (pausedRef.current) return; // Don't advance while paused
if (isPlayingRef.current || queueRef.current.length === 0) return;
if (!enabled || ttsMuted) {
queueRef.current = [];
Expand All @@ -109,12 +124,14 @@ export function useDiscussionTTS({ enabled, agents, onAudioStateChange }: Discus

// Browser TTS
if (item.providerId === 'browser-native-tts') {
currentProviderRef.current = item.providerId;
onAudioStateChangeRef.current?.(item.agentId, 'playing');
browserSpeakRef.current(item.text, item.voiceId);
return;
}

// Server TTS — use the item's provider, not the global one
currentProviderRef.current = item.providerId;
onAudioStateChangeRef.current?.(item.agentId, 'generating');
const controller = new AbortController();
abortControllerRef.current = controller;
Expand Down Expand Up @@ -198,6 +215,8 @@ export function useDiscussionTTS({ enabled, agents, onAudioStateChange }: Discus
);

const cleanup = useCallback(() => {
pausedRef.current = false;
currentProviderRef.current = null;
abortControllerRef.current?.abort();
abortControllerRef.current = null;
if (audioRef.current) {
Expand All @@ -208,22 +227,32 @@ export function useDiscussionTTS({ enabled, agents, onAudioStateChange }: Discus
browserCancelRef.current();
queueRef.current = [];
isPlayingRef.current = false;
pausedRef.current = false;
segmentDoneCounterRef.current = 0;
onAudioStateChangeRef.current?.(null, 'idle');
}, []);

/**
 * Pause TTS audio (browser-native or server). Does NOT stop the SSE stream.
 *
 * Idempotent: a second call while already paused is ignored. The provider of
 * the item currently being played decides which backend is paused.
 */
const pause = useCallback(() => {
  if (pausedRef.current) return;
  pausedRef.current = true;

  if (currentProviderRef.current === 'browser-native-tts') {
    browserPauseRef.current();
  } else if (audioRef.current && !audioRef.current.paused) {
    audioRef.current.pause();
  }
}, []);

/**
 * Resume TTS audio. If the previous utterance already ended while paused,
 * advance the queue.
 *
 * No-op unless currently paused.
 */
const resume = useCallback(() => {
  if (!pausedRef.current) return;
  pausedRef.current = false;

  const audio = audioRef.current;
  if (currentProviderRef.current === 'browser-native-tts') {
    if (isPlayingRef.current) {
      // A browser utterance was interrupted by pause() — continue it.
      // NOTE(review): relies on isPlayingRef being true while a browser
      // utterance is in progress (it is cleared in the browser onEnd handler).
      browserResumeRef.current();
    } else {
      // The last browser utterance had already finished, so there is nothing
      // to re-speak; start whatever was queued during the pause instead.
      processQueueRef.current();
    }
  } else if (audio && audio.paused) {
    // play() returns a promise that rejects when playback cannot start;
    // swallow it so it does not surface as an unhandled rejection.
    audio.play().catch(() => {});
  } else if (!isPlayingRef.current) {
    // Audio finished while paused — kick-start the queue
    processQueueRef.current();
  }
}, []);

Expand Down
34 changes: 33 additions & 1 deletion lib/playback/engine.ts
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,8 @@ export class PlaybackEngine {
private browserTTSChunks: string[] = []; // sentence-level chunks for sequential playback
private browserTTSChunkIndex: number = 0; // index of the chunk currently being spoken
private browserTTSPausedChunks: string[] = []; // chunks saved on pause for cancel+re-speak; the first entry may be the unspoken tail of the interrupted chunk
private browserTTSBoundaryIndex: number = 0; // offset just past the last word reported by onboundary (charIndex + charLength) within the current chunk; reset to 0 for each chunk
private browserTTSCurrentLang: string = ''; // lang resolved for the most recent chunk, reused for later chunks (incl. after pause+resume); '' until the first chunk resolves one
private speechTimerRemaining: number = 0; // remaining ms (set on pause)

constructor(
Expand Down Expand Up @@ -167,7 +169,14 @@ export class PlaybackEngine {
// Cancel+re-speak pattern: speechSynthesis.pause()/resume() is broken on
// Firefox, so we cancel now and re-speak the saved chunks on resume.
//
// The interrupted chunk is sliced at the last word boundary so resume
// continues from the approximate pause position, not the chunk start (#250).
// If nothing but whitespace is left of it, only the following chunks are kept.
const currentChunk = this.browserTTSChunks[this.browserTTSChunkIndex];
const remainingText = currentChunk?.slice(this.browserTTSBoundaryIndex) ?? '';
const futureChunks = this.browserTTSChunks.slice(this.browserTTSChunkIndex + 1);
this.browserTTSPausedChunks = remainingText.trim()
  ? [remainingText, ...futureChunks]
  : futureChunks;
window.speechSynthesis?.cancel();
// Note: cancel fires onerror('canceled'), which we ignore (see playBrowserTTSChunk)
} else if (this.audioPlayer.isPlaying()) {
Expand Down Expand Up @@ -617,6 +626,8 @@ export class PlaybackEngine {
private playBrowserTTS(speechAction: SpeechAction): void {
this.browserTTSChunks = this.splitIntoChunks(speechAction.text);
this.browserTTSChunkIndex = 0;
this.browserTTSBoundaryIndex = 0;
this.browserTTSCurrentLang = '';
this.browserTTSPausedChunks = [];
this.browserTTSActive = true;
this.playBrowserTTSChunk();
Expand All @@ -635,8 +646,24 @@ export class PlaybackEngine {

const settings = useSettingsStore.getState();
const chunkText = this.browserTTSChunks[this.browserTTSChunkIndex];
this.browserTTSBoundaryIndex = 0; // reset for new chunk
const utterance = new SpeechSynthesisUtterance(chunkText);

// If we have a saved lang from a previous chunk (e.g., from pause+resume),
// use it to prevent voice switching at language boundaries.
if (this.browserTTSCurrentLang) {
utterance.lang = this.browserTTSCurrentLang;
}

// Track word boundaries for resume-from-position (#250).
// Save charIndex + charLength (= end of word) so resume skips the
// word that was already spoken, rather than repeating it.
utterance.onboundary = (event) => {
if (event.name === 'word') {
this.browserTTSBoundaryIndex = event.charIndex + (event.charLength ?? 0);
}
};

// Apply settings
const speed = this.callbacks.getPlaybackSpeed?.() ?? 1;
utterance.rate = (settings.ttsSpeed ?? 1) * speed;
Expand All @@ -662,6 +689,9 @@ export class PlaybackEngine {
(chunkText.match(/[\u4e00-\u9fff\u3400-\u4dbf]/g) || []).length / chunkText.length;
utterance.lang = cjkRatio > CJK_LANG_THRESHOLD ? 'zh-CN' : 'en-US';
}
// Save resolved lang so sliced resume chunks use the same language
// (prevents voice switching when pause point is at a language boundary)
this.browserTTSCurrentLang = utterance.lang;

utterance.onend = () => {
this.browserTTSChunkIndex++;
Expand Down Expand Up @@ -730,6 +760,8 @@ export class PlaybackEngine {
this.browserTTSActive = false;
this.browserTTSChunks = [];
this.browserTTSChunkIndex = 0;
this.browserTTSBoundaryIndex = 0;
this.browserTTSCurrentLang = '';
this.browserTTSPausedChunks = [];
window.speechSynthesis?.cancel();
}
Expand Down
Loading