THU-MAIC · YizukiAme · Mar 24, 2026 · Mar 24, 2026
diff --git a/lib/hooks/use-browser-tts.ts b/lib/hooks/use-browser-tts.ts
@@ -34,6 +34,22 @@ export function useBrowserTTS(options: UseBrowserTTSOptions = {}) {
   const [availableVoices, setAvailableVoices] = useState<SpeechSynthesisVoice[]>([]);
   const utteranceRef = useRef<SpeechSynthesisUtterance | null>(null);
 
+  /**
+   * Cancel+re-speak state for instant pause & resume-from-position.
+   *
+   * Approach (same pattern as PlaybackEngine):
+   * - pause():  call cancel() for instant silence; save text + last word boundary
+   * - resume(): re-speak text.slice(lastBoundaryIndex) with the same voice
+   *
+   * This avoids the ~300ms delay of speechSynthesis.pause() and enables
+   * resuming from approximate pause position rather than sentence start.
+   */
+  const pausedTextRef = useRef<string | null>(null);
+  const pausedVoiceURIRef = useRef<string | undefined>(undefined);
+  const lastBoundaryIndexRef = useRef(0);
+  /** Flag to suppress onEnd/onError callbacks fired synchronously by cancel-for-pause */
+  const cancellingForPauseRef = useRef(false);
+
   // Load available voices
   useEffect(() => {
     if (typeof window === 'undefined' || !window.speechSynthesis) {
@@ -59,13 +75,23 @@ export function useBrowserTTS(options: UseBrowserTTSOptions = {}) {
     };
   }, []);
 
-  const speak = useCallback(
-    (text: string, voiceURI?: string) => {
+  /**
+   * Internal speak helper — shared by public speak() and resume().
+   * @param isResume  When true, suppresses the onStart callback to avoid
+   *                  duplicate side effects when resuming from pause.
+   */
+  const speakInternal = useCallback(
+    (text: string, voiceURI?: string, isResume?: boolean) => {
       if (typeof window === 'undefined' || !window.speechSynthesis) {
         onError?.('浏览器不支持 Web Speech API');
         return;
       }
 
+      // Reset pause-cancel flag — new speech (from speak() or resume()) means
+      // any pending async onEnd/onError from a previous cancel-for-pause should
+      // no longer be suppressed (they've either already fired or are stale).
+      cancellingForPauseRef.current = false;
+
       // Cancel any ongoing speech
       window.speechSynthesis.cancel();
 
@@ -83,54 +109,95 @@ export function useBrowserTTS(options: UseBrowserTTSOptions = {}) {
         }
       }
 
+      // Track word boundaries for resume-from-position.
+      // Save charIndex + charLength (= end of word) so resume skips the
+      // word that was already spoken, rather than repeating it.
+      utterance.onboundary = (event) => {
+        if (event.name === 'word') {
+          lastBoundaryIndexRef.current = event.charIndex + (event.charLength ?? 0);
+        }
+      };
+
       utterance.onstart = () => {
         setIsSpeaking(true);
         setIsPaused(false);
-        onStart?.();
+        if (!isResume) onStart?.();
       };
 
       utterance.onend = () => {
+        if (cancellingForPauseRef.current) return; // suppress — pause handler owns state
         setIsSpeaking(false);
         setIsPaused(false);
         utteranceRef.current = null;
+        pausedTextRef.current = null;
         onEnd?.();
       };
 
       utterance.onerror = (event) => {
+        if (cancellingForPauseRef.current) return; // suppress — cancel-for-pause fires 'canceled'
         setIsSpeaking(false);
         setIsPaused(false);
         utteranceRef.current = null;
+        pausedTextRef.current = null;
         onError?.(event.error);
       };
 
-      utterance.onpause = () => {
-        setIsPaused(true);
-      };
-
-      utterance.onresume = () => {
-        setIsPaused(false);
-      };
-
       utteranceRef.current = utterance;
+      // Save full text + voice for potential pause+re-speak
+      pausedTextRef.current = text;
+      pausedVoiceURIRef.current = voiceURI;
+      lastBoundaryIndexRef.current = 0;
       window.speechSynthesis.speak(utterance);
     },
     [rate, pitch, volume, lang, availableVoices, onStart, onEnd, onError],
   );
 
+  const speak = useCallback(
+    (text: string, voiceURI?: string) => {
+      speakInternal(text, voiceURI, false);
+    },
+    [speakInternal],
+  );
+
   const pause = useCallback(() => {
     if (typeof window !== 'undefined' && window.speechSynthesis) {
-      window.speechSynthesis.pause();
+      // Cancel+re-speak pattern: cancel() is instant — no ~300ms delay
+      // like speechSynthesis.pause(). Text + boundary position are already
+      // saved by speakInternal, so resume() can re-speak from there.
+      cancellingForPauseRef.current = true;
+      window.speechSynthesis.cancel();
+      // Keep cancellingForPauseRef = true for the entire pause period.
+      // Chrome fires onend/onerror asynchronously after cancel(), so we
+      // must NOT reset the flag here. It is reset in speakInternal() when
+      // new speech starts (from speak() or resume()).
+      setIsPaused(true);
     }
   }, []);
 
   const resume = useCallback(() => {
-    if (typeof window !== 'undefined' && window.speechSynthesis) {
-      window.speechSynthesis.resume();
+    if (typeof window !== 'undefined' && window.speechSynthesis && pausedTextRef.current) {
+      const fullText = pausedTextRef.current;
+      const voiceURI = pausedVoiceURIRef.current;
+      // Slice from last word boundary for resume-from-position
+      const remaining = fullText.slice(lastBoundaryIndexRef.current);
+      pausedTextRef.current = null;
+      setIsPaused(false);
+      if (remaining.trim()) {
+        speakInternal(remaining, voiceURI, true);
+      } else {
+        // Nothing left to speak — treat as natural end
+        setIsSpeaking(false);
+        utteranceRef.current = null;
+        onEnd?.();
+      }
     }
-  }, []);
+  }, [speakInternal, onEnd]);
 
   const cancel = useCallback(() => {
     if (typeof window !== 'undefined' && window.speechSynthesis) {
+      pausedTextRef.current = null;
+      cancellingForPauseRef.current = false;
+      lastBoundaryIndexRef.current = 0;
       window.speechSynthesis.cancel();
       setIsSpeaking(false);
       setIsPaused(false);

diff --git a/lib/hooks/use-discussion-tts.ts b/lib/hooks/use-discussion-tts.ts
@@ -36,26 +36,40 @@ export function useDiscussionTTS({ enabled, agents, onAudioStateChange }: Discus
   const queueRef = useRef<QueueItem[]>([]);
   const isPlayingRef = useRef(false);
   const pausedRef = useRef(false);
+  /** Tracks which TTS provider is currently speaking (for pause/resume delegation) */
+  const currentProviderRef = useRef<TTSProviderId | null>(null);
   const segmentDoneCounterRef = useRef(0);
   const abortControllerRef = useRef<AbortController | null>(null);
   const audioRef = useRef<HTMLAudioElement | null>(null);
   const onAudioStateChangeRef = useRef(onAudioStateChange);
   onAudioStateChangeRef.current = onAudioStateChange;
   const processQueueRef = useRef<() => void>(() => {});
 
-  const { speak: browserSpeak, cancel: browserCancel } = useBrowserTTS({
+  const {
+    speak: browserSpeak,
+    pause: browserPause,
+    resume: browserResume,
+    cancel: browserCancel,
+  } = useBrowserTTS({
     rate: ttsSpeed,
     onEnd: () => {
       isPlayingRef.current = false;
       segmentDoneCounterRef.current++;
       onAudioStateChangeRef.current?.(null, 'idle');
-      processQueueRef.current();
+      // Don't advance queue while paused — resume() will kick-start it
+      if (!pausedRef.current) {
+        processQueueRef.current();
+      }
     },
   });
   const browserCancelRef = useRef(browserCancel);
   browserCancelRef.current = browserCancel;
   const browserSpeakRef = useRef(browserSpeak);
   browserSpeakRef.current = browserSpeak;
+  const browserPauseRef = useRef(browserPause);
+  browserPauseRef.current = browserPause;
+  const browserResumeRef = useRef(browserResume);
+  browserResumeRef.current = browserResume;
 
   // Build agent index map for deterministic voice resolution
   const agentIndexMap = useRef<Map<string, number>>(new Map());
@@ -98,6 +112,7 @@ export function useDiscussionTTS({ enabled, agents, onAudioStateChange }: Discus
   );
 
   const processQueue = useCallback(async () => {
+    if (pausedRef.current) return; // Don't advance while paused
     if (isPlayingRef.current || queueRef.current.length === 0) return;
     if (!enabled || ttsMuted) {
       queueRef.current = [];
@@ -109,12 +124,14 @@ export function useDiscussionTTS({ enabled, agents, onAudioStateChange }: Discus
 
     // Browser TTS
     if (item.providerId === 'browser-native-tts') {
+      currentProviderRef.current = item.providerId;
       onAudioStateChangeRef.current?.(item.agentId, 'playing');
       browserSpeakRef.current(item.text, item.voiceId);
       return;
     }
 
     // Server TTS — use the item's provider, not the global one
+    currentProviderRef.current = item.providerId;
     onAudioStateChangeRef.current?.(item.agentId, 'generating');
     const controller = new AbortController();
     abortControllerRef.current = controller;
@@ -198,6 +215,8 @@ export function useDiscussionTTS({ enabled, agents, onAudioStateChange }: Discus
   );
 
   const cleanup = useCallback(() => {
+    pausedRef.current = false;
+    currentProviderRef.current = null;
     abortControllerRef.current?.abort();
     abortControllerRef.current = null;
     if (audioRef.current) {
@@ -208,22 +227,32 @@ export function useDiscussionTTS({ enabled, agents, onAudioStateChange }: Discus
     browserCancelRef.current();
     queueRef.current = [];
     isPlayingRef.current = false;
-    pausedRef.current = false;
     segmentDoneCounterRef.current = 0;
     onAudioStateChangeRef.current?.(null, 'idle');
   }, []);
 
+  /** Pause TTS audio (browser-native or server). Does NOT stop the SSE stream. */
   const pause = useCallback(() => {
+    if (pausedRef.current) return;
     pausedRef.current = true;
-    if (audioRef.current && !audioRef.current.paused) {
+    if (currentProviderRef.current === 'browser-native-tts') {
+      browserPauseRef.current();
+    } else if (audioRef.current && !audioRef.current.paused) {
       audioRef.current.pause();
     }
   }, []);
 
+  /** Resume TTS audio. If the previous utterance already ended while paused, advance the queue. */
   const resume = useCallback(() => {
+    if (!pausedRef.current) return;
     pausedRef.current = false;
-    if (audioRef.current && audioRef.current.paused && audioRef.current.src) {
-      audioRef.current.play().catch(() => {});
+    if (currentProviderRef.current === 'browser-native-tts') {
+      browserResumeRef.current();
+    } else if (audioRef.current && audioRef.current.paused) {
+      audioRef.current.play();
+    } else if (!isPlayingRef.current) {
+      // Audio finished while paused — kick-start the queue
+      processQueueRef.current();
     }
   }, []);
 

diff --git a/lib/playback/engine.ts b/lib/playback/engine.ts
@@ -81,6 +81,8 @@ export class PlaybackEngine {
   private browserTTSChunks: string[] = []; // sentence-level chunks for sequential playback
   private browserTTSChunkIndex: number = 0; // current chunk being spoken
   private browserTTSPausedChunks: string[] = []; // remaining chunks saved on pause (for cancel+re-speak)
+  private browserTTSBoundaryIndex: number = 0; // charIndex within current chunk from onboundary (for resume-from-position)
+  private browserTTSCurrentLang: string = ''; // resolved lang of current chunk (preserved across pause+resume)
   private speechTimerRemaining: number = 0; // remaining ms (set on pause)
 
   constructor(
@@ -167,7 +169,14 @@ export class PlaybackEngine {
           // Cancel+re-speak pattern: save remaining chunks for resume.
           // speechSynthesis.pause()/resume() is broken on Firefox, so we
           // cancel now and re-speak from current chunk onward on resume.
-          this.browserTTSPausedChunks = this.browserTTSChunks.slice(this.browserTTSChunkIndex);
+          // Slice the current chunk from the last word boundary so resume
+          // continues from approximate pause position, not chunk start (#250).
+          const currentChunk = this.browserTTSChunks[this.browserTTSChunkIndex];
+          const remainingText = currentChunk?.slice(this.browserTTSBoundaryIndex) ?? '';
+          const futureChunks = this.browserTTSChunks.slice(this.browserTTSChunkIndex + 1);
+          this.browserTTSPausedChunks = remainingText.trim()
+            ? [remainingText, ...futureChunks]
+            : futureChunks;
           window.speechSynthesis?.cancel();
           // Note: cancel fires onerror('canceled'), which we ignore (see playBrowserTTSChunk)
         } else if (this.audioPlayer.isPlaying()) {
@@ -617,6 +626,8 @@ export class PlaybackEngine {
   private playBrowserTTS(speechAction: SpeechAction): void {
     this.browserTTSChunks = this.splitIntoChunks(speechAction.text);
     this.browserTTSChunkIndex = 0;
+    this.browserTTSBoundaryIndex = 0;
+    this.browserTTSCurrentLang = '';
     this.browserTTSPausedChunks = [];
     this.browserTTSActive = true;
     this.playBrowserTTSChunk();
@@ -635,8 +646,24 @@ export class PlaybackEngine {
 
     const settings = useSettingsStore.getState();
     const chunkText = this.browserTTSChunks[this.browserTTSChunkIndex];
+    this.browserTTSBoundaryIndex = 0; // reset for new chunk
     const utterance = new SpeechSynthesisUtterance(chunkText);
 
+    // If we have a saved lang from a previous chunk (e.g., from pause+resume),
+    // use it to prevent voice switching at language boundaries.
+    if (this.browserTTSCurrentLang) {
+      utterance.lang = this.browserTTSCurrentLang;
+    }
+
+    // Track word boundaries for resume-from-position (#250).
+    // Save charIndex + charLength (= end of word) so resume skips the
+    // word that was already spoken, rather than repeating it.
+    utterance.onboundary = (event) => {
+      if (event.name === 'word') {
+        this.browserTTSBoundaryIndex = event.charIndex + (event.charLength ?? 0);
+      }
+    };
+
     // Apply settings
     const speed = this.callbacks.getPlaybackSpeed?.() ?? 1;
     utterance.rate = (settings.ttsSpeed ?? 1) * speed;
@@ -662,6 +689,9 @@ export class PlaybackEngine {
         (chunkText.match(/[\u4e00-\u9fff\u3400-\u4dbf]/g) || []).length / chunkText.length;
       utterance.lang = cjkRatio > CJK_LANG_THRESHOLD ? 'zh-CN' : 'en-US';
     }
+    // Save resolved lang so sliced resume chunks use the same language
+    // (prevents voice switching when pause point is at a language boundary)
+    this.browserTTSCurrentLang = utterance.lang;
 
     utterance.onend = () => {
       this.browserTTSChunkIndex++;
@@ -730,6 +760,8 @@ export class PlaybackEngine {
       this.browserTTSActive = false;
       this.browserTTSChunks = [];
       this.browserTTSChunkIndex = 0;
+      this.browserTTSBoundaryIndex = 0;
+      this.browserTTSCurrentLang = '';
       this.browserTTSPausedChunks = [];
       window.speechSynthesis?.cancel();
     }