tinyhumansai · M3gA-Mind · Jun 4, 2026 · Jun 4, 2026 · Jun 4, 2026 · Jun 4, 2026
@@ -21,6 +21,7 @@ import PersistRehydrationScreen from './components/PersistRehydrationScreen';
 import SecurityBanner from './components/SecurityBanner';
 import GlobalUpsellBanner from './components/upsell/GlobalUpsellBanner';
 import AppWalkthrough from './components/walkthrough/AppWalkthrough';
+import { useVoiceSpeak } from './features/human/voice/useVoiceSpeak';
 import { MascotFrameProducer } from './features/meet/MascotFrameProducer';
 import { useNotchBootSync } from './hooks/useNotchBootSync';
 import { I18nProvider } from './lib/i18n/I18nContext';
@@ -155,6 +156,9 @@ function AppShellDesktop() {
   const location = useLocation();
   const navigate = useNavigate();
   const { snapshot, isBootstrapping } = useCoreState();
+  // Play proactive assistant speech (e.g. spoken approval prompts for
+  // voice-initiated turns — Phase 4 of #3148) app-wide.
+  useVoiceSpeak();
   const activeAccountId = useAppSelector(state => state.accounts.activeAccountId);
   // On /accounts, only the agent view keeps the tab bar + its reserved
   // bottom padding. Any other selected "app" (e.g. WhatsApp) takes the

@@ -0,0 +1,67 @@
+import { renderHook } from '@testing-library/react';
+import { beforeEach, describe, expect, it, vi } from 'vitest';
+
+import { useVoiceSpeak } from './useVoiceSpeak';
+
+// Mock functions shared between the `vi.mock` factories below and the tests.
+// They are created through `vi.hoisted` so the factories can reference them.
+const hoisted = vi.hoisted(() => ({
+  onMock: vi.fn<(event: string, cb: (...args: unknown[]) => void) => void>(),
+  offMock: vi.fn(),
+  synthMock: vi.fn(),
+  playMock: vi.fn(),
+  stopMock: vi.fn(),
+}));
+
+// Replace the socket service with stubs that record `on`/`off` calls.
+vi.mock('../../../services/socketService', () => ({
+  socketService: { on: hoisted.onMock, off: hoisted.offMock },
+}));
+// Stub speech synthesis; its resolved value is configured in `beforeEach`.
+vi.mock('./ttsClient', () => ({ synthesizeSpeech: hoisted.synthMock }));
+// Stub audio playback; `swallowAudioStop` becomes an inert mock.
+vi.mock('./audioPlayer', () => ({ playBase64Audio: hoisted.playMock, swallowAudioStop: vi.fn() }));
+
+/**
+ * Look up the listener the hook passed to `socketService.on('voice:speak', …)`.
+ * Throws when no such subscription was recorded on the mock.
+ */
+function speakHandler(): (...args: unknown[]) => void {
+  const registration = hoisted.onMock.mock.calls.find(entry => entry[0] === 'voice:speak');
+  if (registration === undefined) {
+    throw new Error('useVoiceSpeak did not subscribe to voice:speak');
+  }
+  const [, listener] = registration;
+  return listener;
+}
+
+// Behavioral tests for `useVoiceSpeak`, driven through the mocked socket
+// service: each test renders the hook, then invokes the captured
+// `voice:speak` listener directly.
+describe('useVoiceSpeak', () => {
+  beforeEach(() => {
+    vi.clearAllMocks();
+    // Default happy path: synthesis yields audio and playback starts at once.
+    hoisted.synthMock.mockResolvedValue({
+      audio_base64: 'AAA=',
+      audio_mime: 'audio/mpeg',
+      visemes: [],
+    });
+    hoisted.playMock.mockResolvedValue({ ended: Promise.resolve(), stop: hoisted.stopMock });
+  });
+
+  it('synthesizes and plays the spoken prompt on voice:speak', async () => {
+    renderHook(() => useVoiceSpeak());
+    speakHandler()({ text: 'Post to Slack. Say yes to confirm.', source: 'approval' });
+
+    await vi.waitFor(() => expect(hoisted.playMock).toHaveBeenCalledTimes(1));
+    expect(hoisted.synthMock).toHaveBeenCalledWith('Post to Slack. Say yes to confirm.');
+    expect(hoisted.playMock).toHaveBeenCalledWith('AAA=', 'audio/mpeg', expect.any(Object));
+  });
+
+  it('ignores an empty/whitespace prompt without synthesizing', async () => {
+    renderHook(() => useVoiceSpeak());
+    speakHandler()({ text: '   ' });
+    await Promise.resolve();
+    expect(hoisted.synthMock).not.toHaveBeenCalled();
+    expect(hoisted.playMock).not.toHaveBeenCalled();
+  });
+
+  it('ignores a malformed payload', async () => {
+    renderHook(() => useVoiceSpeak());
+    speakHandler()({ notText: true });
+    await Promise.resolve();
+    expect(hoisted.synthMock).not.toHaveBeenCalled();
+  });
+
+  it('unsubscribes and stops playback on unmount', async () => {
+    const { unmount } = renderHook(() => useVoiceSpeak());
+    const handler = speakHandler();
+    handler({ text: 'Post to Slack. Say yes to confirm.', source: 'approval' });
+
+    // Wait until playback has started so there is a live handle to stop.
+    await vi.waitFor(() => expect(hoisted.playMock).toHaveBeenCalledTimes(1));
+    // Give the hook's continuation after `await playBase64Audio(...)` a tick
+    // to store the handle before unmounting.
+    await Promise.resolve();
+    expect(hoisted.stopMock).not.toHaveBeenCalled();
+
+    unmount();
+    // The exact listener that was registered must be the one removed.
+    expect(hoisted.offMock).toHaveBeenCalledWith('voice:speak', handler);
+    expect(hoisted.stopMock).toHaveBeenCalledTimes(1);
+  });
+});
@@ -0,0 +1,72 @@
+import debug from 'debug';
+import { useEffect, useRef } from 'react';
+
+import { socketService } from '../../../services/socketService';
+import { type PlaybackHandle, playBase64Audio, swallowAudioStop } from './audioPlayer';
+import { synthesizeSpeech } from './ttsClient';
+
+const log = debug('human:voice-speak');
+
+/** Hard cap on a single spoken prompt, guarding against runaway TTS. */
+const MAX_SPEAK_MS = 20_000;
+
+/** Payload of the core `voice:speak` socket event (mirrors `SpeakRequest`). */
+interface SpeakPayload {
+  text: string;
+  source?: string | null;
+}
+
+/**
+ * Runtime guard for `voice:speak` payloads: accepts any non-null object whose
+ * `text` property is a string. The `source` field is not inspected.
+ */
+function isSpeakPayload(value: unknown): value is SpeakPayload {
+  if (value === null || typeof value !== 'object') return false;
+  const candidate = value as { text?: unknown };
+  return typeof candidate.text === 'string';
+}
+
+/**
+ * Play proactive assistant speech requested by the core via `voice:speak`.
+ *
+ * Today's only producer is the voice-native approval surface: when a sensitive
+ * action is parked for approval during a voice-initiated turn, the core asks the
+ * assistant to speak the confirmation aloud so a hands-free user can answer
+ * "yes"/"no" by voice (Phase 4 of #3148). Mounted once, app-wide, so the prompt
+ * is heard even when the mascot view isn't open — it synthesizes through the
+ * same TTS path the mascot uses and plays the returned audio directly.
+ *
+ * Each event's text is trimmed, synthesized with `synthesizeSpeech`, and played
+ * with `playBase64Audio` (capped at `MAX_SPEAK_MS`). Requests are "latest
+ * wins": a newer `voice:speak` event, or unmounting the hook, invalidates any
+ * request that is still synthesizing or starting playback, so prompts never
+ * overlap and nothing starts playing after unmount. Malformed payloads and
+ * empty text are ignored; synthesis/playback failures are logged, not thrown.
+ */
+export function useVoiceSpeak(): void {
+  const handleRef = useRef<PlaybackHandle | null>(null);
+
+  useEffect(() => {
+    // Id of the most recent request for this effect instance. Bumped on every
+    // accepted event and on cleanup; an async request whose id no longer
+    // matches has been superseded and must not start (or keep) playing.
+    let latestRequestId = 0;
+
+    // Stop whatever prompt is currently tracked and forget its handle.
+    const stopCurrent = (): void => {
+      handleRef.current?.stop();
+      handleRef.current = null;
+    };
+
+    const onSpeak = (...args: unknown[]): void => {
+      const payload = args[0];
+      if (!isSpeakPayload(payload)) return;
+      const text = payload.text.trim();
+      if (!text) return;
+      log('voice:speak source=%s chars=%d', payload.source ?? 'unknown', text.length);
+
+      latestRequestId += 1;
+      const requestId = latestRequestId;
+      const isStale = (): boolean => requestId !== latestRequestId;
+
+      void (async () => {
+        try {
+          const { audio_base64: audioBase64, audio_mime: audioMime } = await synthesizeSpeech(text);
+          // A newer prompt arrived (or the hook unmounted) while synthesizing,
+          // or the synthesizer returned no audio: nothing to play.
+          if (isStale() || !audioBase64) return;
+          // Stop any in-flight prompt before starting the next one.
+          stopCurrent();
+          const handle = await playBase64Audio(audioBase64, audioMime || 'audio/mpeg', {
+            maxDurationMs: MAX_SPEAK_MS,
+          });
+          // Attach the handler first so a rejection of `ended` caused by a
+          // later `stop()` (if the player rejects on stop) is always handled.
+          handle.ended.catch(swallowAudioStop);
+          if (isStale()) {
+            // Superseded while playback was starting: silence it immediately
+            // instead of tracking it, so it cannot overlap the newer prompt
+            // or outlive the unmounted hook.
+            handle.stop();
+            return;
+          }
+          handleRef.current = handle;
+        } catch (err) {
+          log('voice:speak playback failed: %o', err);
+        }
+      })();
+    };
+
+    socketService.on('voice:speak', onSpeak);
+    return () => {
+      socketService.off('voice:speak', onSpeak);
+      // Invalidate any request still awaiting synthesis/playback start.
+      latestRequestId += 1;
+      stopCurrent();
+    };
+  }, []);
+}
@@ -293,7 +293,9 @@ const Conversations = ({
   // timer's reference point.
   const sendingThreadIdRef = useRef<string | null>(null);
   // Ref so the mount-time dictation event handler can call the latest send fn.
-  const handleSendMessageRef = useRef<((text?: string) => Promise<void>) | null>(null);
+  const handleSendMessageRef = useRef<
+    ((text?: string, opts?: { voice?: boolean }) => Promise<void>) | null
+  >(null);
   // Previous inference status for the sending thread; lets the rearm effect
   // distinguish "status was just cleared (chat_done / chat_error)" from
   // "status was never set yet (in-flight turn pre-status)".
@@ -438,10 +440,12 @@ const Conversations = ({
 
       customEvent.preventDefault();
 
-      // When autoSend is set (hotkey dictation), dispatch the transcript directly
-      // to the agent without going through the text composer.
+      // When autoSend is set (hotkey dictation / always-on), dispatch the
+      // transcript directly to the agent without going through the text
+      // composer. Tag it voice-initiated so the core speaks any approval prompt
+      // aloud for sensitive actions (Phase 4 of #3148).
       if (customEvent.detail?.autoSend) {
-        void handleSendMessageRef.current?.(text);
+        void handleSendMessageRef.current?.(text, { voice: true });
         return;
       }
 
@@ -672,7 +676,7 @@ const Conversations = ({
     }
   };
 
-  const handleSendMessage = async (text?: string) => {
+  const handleSendMessage = async (text?: string, opts?: { voice?: boolean }) => {
     if (pendingSendRef.current) return;
 
     const normalized = text ?? inputValue;
@@ -786,6 +790,7 @@ const Conversations = ({
         model: CHAT_MODEL_ID,
         profileId: selectedAgentProfileId,
         locale: uiLocale,
+        voice: opts?.voice ?? false,
       });
       trackEvent('chat_message_sent');
       // Backend accepted the send; lifecycle ('started' → 'streaming') now

@@ -708,6 +708,7 @@ describe('Conversations — smoke render (#1123 welcome-lock removal)', () => {
       model: 'reasoning-v1',
       profileId: 'default',
       locale: 'en',
+      voice: false,
     });
   });
 
@@ -732,6 +733,7 @@ describe('Conversations — smoke render (#1123 welcome-lock removal)', () => {
         model: 'reasoning-v1',
         profileId: 'default',
         locale: 'en',
+        voice: true,
       });
     });
   });
@@ -784,6 +786,7 @@ describe('Conversations — smoke render (#1123 welcome-lock removal)', () => {
       model: 'reasoning-v1',
       profileId: 'default',
       locale: 'en',
+      voice: false,
     });
     expect(screen.getByRole('button', { name: 'Send message' })).toBeDisabled();
     resolveSend?.();
@@ -1149,6 +1152,7 @@ describe('Conversations — smoke render (#1123 welcome-lock removal)', () => {
         model: 'reasoning-v1',
         profileId: 'default',
         locale: 'en',
+        voice: false,
       });
     });
   });
@@ -1224,6 +1228,7 @@ describe('Conversations — smoke render (#1123 welcome-lock removal)', () => {
         model: 'reasoning-v1',
         profileId: 'default',
         locale: 'en',
+        voice: false,
       });
     });
   });

@@ -963,6 +963,13 @@ export interface ChatSendParams {
    * (default) aborts the running turn.
    */
   queueMode?: QueueMode | null;
+  /**
+   * `true` when this turn was voice-initiated (dictation / always-on
+   * listening). The core speaks the approval prompt aloud for sensitive
+   * actions on voice turns so a hands-free user can answer by voice
+   * (Phase 4 of #3148). Omitted/`false` keeps typed turns visual-only.
+   */
+  voice?: boolean | null;
 }
 
 /**
@@ -989,6 +996,7 @@ export async function chatSend(params: ChatSendParams): Promise<void> {
       profile_id: params.profileId ?? undefined,
       locale: params.locale ?? undefined,
       queue_mode: params.queueMode ?? undefined,
+      voice: params.voice ?? undefined,
     },
   });
 }

@@ -591,14 +591,31 @@ Shipped on the Windows machine (2026-06-02):
 
 ---
 
-## Phase 4 — Polish ⏳ Not Started
+## Phase 4 — Polish 🔨 In progress
 
 > Voice confirmation loop, UI indicator, computer control onboarding.
 
 **Planned:**
-- TTS confirmation before executing sensitive actions ("Opening Music — confirm?")
-- Always-on status indicator (notch pill from PR #3166 will handle this automatically)
-- Computer control (`mouse`/`keyboard` tools) toggle in Settings onboarding
+- [x] **TTS confirmation before executing sensitive actions** — ✅ Done (voice-native approval; see below)
+- [x] Always-on status indicator (notch pill from PR #3166 handles this automatically)
+- [ ] Computer control (`mouse`/`keyboard` tools) toggle in Settings onboarding
+
+### Change 4.1 — Voice-native approval (spoken TTS confirmation) ✅ Done
+
+**Status:** ✅ Shipped. The existing `ApprovalGate` already classifies sensitive agent tool calls and parks them for a yes/no decision, but the prompt was **visual-only** (the in-app approval card). A hands-free / always-on user looking away from the screen never heard it. This makes the gate **voice-native**: when a sensitive action is parked during a **voice-initiated** turn, the assistant **speaks** the prompt aloud and the user answers by voice.
+
+**Decisions (agreed up front):** make the *existing* approval gate voice-native (not a parallel voice-fast-path gate), and speak **only for voice-initiated turns** (typed approvals stay visual-only).
+
+**Fix — reuses the approval gate, the turn-origin label, and the overlay socket-bridge pattern:**
+- **Origin flag** — a `voice: bool` on `ApprovalChatContext` (set from a new optional `voice` param on the `channel_web_chat` RPC / `chat:start` socket event), stamped onto `DomainEvent::ApprovalRequested { is_voice }` at publish time in `approval/gate.rs`. The frontend tags **dictation / always-on auto-sends** (`Conversations.tsx` → `chatSend({ voice: true })`); typed turns send `voice: false`.
+- **Voice approval surface** — `src/openhuman/voice/approval_surface.rs` (new), an `EventHandler` mirroring `telegram/approval_surface.rs`: on `ApprovalRequested { is_voice: true }` it builds a short spoken line (`spoken_prompt`: `"<summary>. Say yes to confirm, or no to cancel."`) and publishes it. Registered at startup alongside the web/telegram surfaces.
+- **Speak primitive** — `src/openhuman/voice/speak_bus.rs` (new), a `publish_speak` / `subscribe_speak_events` broadcast mirroring `overlay/bus.rs`; `core/socketio.rs` bridges it to a `voice:speak` Socket.IO event (next to the `overlay:attention` bridge).
+- **Frontend playback** — `useVoiceSpeak` (new, mounted app-wide in `AppShellDesktop`) subscribes to `voice:speak` and plays it through the existing TTS pipeline (`synthesizeSpeech` → `playBase64Audio`), so the prompt is heard even when the mascot view isn't open.
+- **Spoken answer — no new code:** a voice "yes"/"no" rides the existing transcription → auto-send → `web.rs` ingress yes/no router → `approval_decide`.
+
+**Tests:** Rust — `spoken_prompt` formatter, the `is_voice` gate (speaks on `true`, **silent on `false`** and on non-approval events, no-speak on empty summary), `speak_bus` round-trip. Frontend (Vitest) — `useVoiceSpeak` synthesizes + plays on `voice:speak`, ignores empty/malformed payloads, unsubscribes on unmount.
+
+**Known follow-up:** the spoken suffix ("Say yes to confirm") is built server-side in English for v1; localizing it through the i18n system (the spoken text isn't a frontend string) is deferred.
 
 ---
 
@@ -652,6 +669,6 @@ From live agent-in-the-loop testing on 2026-06-03 (grounded in `~/.openhuman/log
 | 2 | Text-based "Hey Tiny" wake word | ✅ Done (interim; gates delivery, strips phrase) |
 | 3 | Local command router (intent classifier) | ✅ Done & wired (recognized intents run on the ≤500ms local path; Unknown defers to agent) |
 | 3 | On-device audio wake-word model | ⏳ Not started (text-based match is the interim) |
-| 4 | Voice confirmation loop | ⏳ Not started |
+| 4 | Voice confirmation loop (spoken TTS approval, Change 4.1) | ✅ Done (voice-native approval gate: speaks the prompt for voice-initiated turns; spoken yes/no) |
 | 4 | Computer-control onboarding toggle | ⏳ Not started |
 | 4 | Always-on UI indicator | ✅ Done (notch PR #3166) |
@@ -385,6 +385,12 @@ pub enum DomainEvent {
         /// Socket.IO client id (room) to surface the approval question to,
         /// when known. `None` for non-chat callers.
         client_id: Option<String>,
+        /// Whether the gated turn was **voice-initiated** (dictation / always-on
+        /// listening). When `true`, the voice approval surface
+        /// (`crate::openhuman::voice::approval_surface`) speaks the prompt aloud
+        /// so a hands-free user can answer by voice. `false` for typed turns —
+        /// they stay visual-only (the in-app approval card). Phase 4 of #3148.
+        is_voice: bool,
     },
     /// User decided a pending approval. Published by `approval_decide`
     /// RPC handler after the gate's parked future resolves.

@@ -525,6 +525,7 @@ fn approval_requested_does_not_surface_session_id() {
         args_redacted: serde_json::json!({ "tool_slug": "SLACK_SEND" }),
         thread_id: Some("t-1".to_string()),
         client_id: Some("c-1".to_string()),
+        is_voice: false,
     };
     let dbg = format!("{event:?}");
     assert!(

@@ -2155,6 +2155,8 @@ pub async fn bootstrap_core_runtime(host_kind: crate::core::types::HostKind) {
         // frontend → every prompt dies at the TTL. Idempotent (Once-guarded).
         crate::openhuman::channels::providers::web::register_approval_surface_subscriber();
         crate::openhuman::channels::providers::web::register_artifact_surface_subscriber();
+        // Speak approval prompts for voice-initiated turns (Phase 4 of #3148).
+        crate::openhuman::voice::approval_surface::register_voice_approval_surface();
     } else {
         log::error!(
             "[runtime] approval gate DISABLED (OPENHUMAN_APPROVAL_GATE=0 honored on host={}) — \