Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions app/src/App.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import PersistRehydrationScreen from './components/PersistRehydrationScreen';
import SecurityBanner from './components/SecurityBanner';
import GlobalUpsellBanner from './components/upsell/GlobalUpsellBanner';
import AppWalkthrough from './components/walkthrough/AppWalkthrough';
import { useVoiceSpeak } from './features/human/voice/useVoiceSpeak';
import { MascotFrameProducer } from './features/meet/MascotFrameProducer';
import { useNotchBootSync } from './hooks/useNotchBootSync';
import { I18nProvider } from './lib/i18n/I18nContext';
Expand Down Expand Up @@ -155,6 +156,9 @@ function AppShellDesktop() {
const location = useLocation();
const navigate = useNavigate();
const { snapshot, isBootstrapping } = useCoreState();
// Play proactive assistant speech (e.g. spoken approval prompts for
// voice-initiated turns — Phase 4 of #3148) app-wide.
useVoiceSpeak();
const activeAccountId = useAppSelector(state => state.accounts.activeAccountId);
// On /accounts, only the agent view keeps the tab bar + its reserved
// bottom padding. Any other selected "app" (e.g. WhatsApp) takes the
Expand Down
67 changes: 67 additions & 0 deletions app/src/features/human/voice/useVoiceSpeak.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
import { renderHook } from '@testing-library/react';
import { beforeEach, describe, expect, it, vi } from 'vitest';

import { useVoiceSpeak } from './useVoiceSpeak';

// Mock functions shared between the `vi.mock` factories below and the test
// bodies. They are created inside `vi.hoisted` so the factories can reference
// them.
const hoisted = vi.hoisted(() => ({
// Stands in for `socketService.on`; typed so `speakHandler` can pull the
// registered callback back out of `mock.calls`.
onMock: vi.fn<(event: string, cb: (...args: unknown[]) => void) => void>(),
// Stands in for `socketService.off`.
offMock: vi.fn(),
// Stands in for `synthesizeSpeech` from `./ttsClient`.
synthMock: vi.fn(),
// Stands in for `playBase64Audio` from `./audioPlayer`.
playMock: vi.fn(),
// Used as the `stop` function of the playback handle that `playMock` resolves to.
stopMock: vi.fn(),
}));

// Replace the hook's three collaborators (socket, TTS client, audio player)
// with the mocks above so no real socket or audio work happens.
vi.mock('../../../services/socketService', () => ({
socketService: { on: hoisted.onMock, off: hoisted.offMock },
}));
vi.mock('./ttsClient', () => ({ synthesizeSpeech: hoisted.synthMock }));
vi.mock('./audioPlayer', () => ({ playBase64Audio: hoisted.playMock, swallowAudioStop: vi.fn() }));

/**
 * Look up the callback the hook passed to `socketService.on('voice:speak', …)`.
 *
 * Throws when no such subscription was recorded, so a test fails loudly
 * instead of silently invoking nothing.
 */
function speakHandler(): (...args: unknown[]) => void {
  for (const [eventName, callback] of hoisted.onMock.mock.calls) {
    if (eventName === 'voice:speak') return callback;
  }
  throw new Error('useVoiceSpeak did not subscribe to voice:speak');
}

describe('useVoiceSpeak', () => {
  beforeEach(() => {
    vi.clearAllMocks();
    // Default happy path: synthesis returns audio and playback yields a handle
    // whose `stop` is observable through `stopMock`.
    hoisted.synthMock.mockResolvedValue({
      audio_base64: 'AAA=',
      audio_mime: 'audio/mpeg',
      visemes: [],
    });
    hoisted.playMock.mockResolvedValue({ ended: Promise.resolve(), stop: hoisted.stopMock });
  });

  it('synthesizes and plays the spoken prompt on voice:speak', async () => {
    renderHook(() => useVoiceSpeak());
    speakHandler()({ text: 'Post to Slack. Say yes to confirm.', source: 'approval' });

    await vi.waitFor(() => expect(hoisted.playMock).toHaveBeenCalledTimes(1));
    expect(hoisted.synthMock).toHaveBeenCalledWith('Post to Slack. Say yes to confirm.');
    expect(hoisted.playMock).toHaveBeenCalledWith('AAA=', 'audio/mpeg', expect.any(Object));
  });

  it('ignores an empty/whitespace prompt without synthesizing', async () => {
    renderHook(() => useVoiceSpeak());
    speakHandler()({ text: ' ' });
    await Promise.resolve();
    expect(hoisted.synthMock).not.toHaveBeenCalled();
    expect(hoisted.playMock).not.toHaveBeenCalled();
  });

  it('ignores a malformed payload', async () => {
    renderHook(() => useVoiceSpeak());
    speakHandler()({ notText: true });
    await Promise.resolve();
    expect(hoisted.synthMock).not.toHaveBeenCalled();
    expect(hoisted.playMock).not.toHaveBeenCalled();
  });

  it('unsubscribes and stops playback on unmount', async () => {
    const { unmount } = renderHook(() => useVoiceSpeak());
    const handler = speakHandler();

    // Start a prompt first so there is live playback for the cleanup to stop.
    handler({ text: 'Post to Slack. Say yes to confirm.', source: 'approval' });
    await vi.waitFor(() => expect(hoisted.playMock).toHaveBeenCalledTimes(1));
    // Yield one macrotask so the hook has stored the resolved playback handle.
    await new Promise<void>(resolve => setTimeout(resolve, 0));

    unmount();

    // The exact callback that was registered must be the one removed.
    expect(hoisted.offMock).toHaveBeenCalledWith('voice:speak', handler);
    expect(hoisted.stopMock).toHaveBeenCalledTimes(1);
  });
});
72 changes: 72 additions & 0 deletions app/src/features/human/voice/useVoiceSpeak.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
import debug from 'debug';
import { useEffect, useRef } from 'react';

import { socketService } from '../../../services/socketService';
import { type PlaybackHandle, playBase64Audio, swallowAudioStop } from './audioPlayer';
import { synthesizeSpeech } from './ttsClient';

// `debug` logger for this module, under the `human:voice-speak` namespace.
const log = debug('human:voice-speak');

/**
 * Hard cap on a single spoken prompt, guarding against runaway TTS.
 * Handed to `playBase64Audio` as its `maxDurationMs` option.
 */
const MAX_SPEAK_MS = 20_000;

/** Payload of the core `voice:speak` socket event (mirrors `SpeakRequest`). */
interface SpeakPayload {
  text: string;
  source?: string | null;
}

/**
 * Runtime guard for untyped socket arguments: accepts any non-null object
 * whose `text` property is a string. `source` is not inspected.
 */
function isSpeakPayload(value: unknown): value is SpeakPayload {
  if (typeof value !== 'object' || value === null) return false;
  const candidate = value as { text?: unknown };
  return typeof candidate.text === 'string';
}

/**
 * Play proactive assistant speech requested by the core via `voice:speak`.
 *
 * Today's only producer is the voice-native approval surface: when a sensitive
 * action is parked for approval during a voice-initiated turn, the core asks the
 * assistant to speak the confirmation aloud so a hands-free user can answer
 * "yes"/"no" by voice (Phase 4 of #3148). Mounted once, app-wide, so the prompt
 * is heard even when the mascot view isn't open — it synthesizes through the
 * same TTS path the mascot uses and plays the returned audio directly.
 *
 * Concurrency rules:
 * - Latest prompt wins. A prompt that is still synthesizing or starting when a
 *   newer `voice:speak` arrives is dropped, so prompts never overlap and a slow
 *   older synthesis cannot cut off a newer prompt.
 * - Nothing plays after unmount. Work that resolves after the effect cleanup is
 *   discarded, and any playback it already started is stopped immediately.
 *
 * Payloads that fail `isSpeakPayload`, or whose text is empty after trimming,
 * are ignored. Synthesis/playback failures are logged and never thrown.
 */
export function useVoiceSpeak(): void {
  const handleRef = useRef<PlaybackHandle | null>(null);

  useEffect(() => {
    // Set by the cleanup below; async work finishing afterwards must not play.
    let disposed = false;
    // Id of the most recent accepted prompt; older in-flight prompts yield to it.
    let latestRequestId = 0;

    const stopCurrent = (): void => {
      handleRef.current?.stop();
      handleRef.current = null;
    };

    const onSpeak = (...args: unknown[]): void => {
      const payload = args[0];
      if (!isSpeakPayload(payload)) return;
      const text = payload.text.trim();
      if (!text) return;
      log('voice:speak source=%s chars=%d', payload.source ?? 'unknown', text.length);

      latestRequestId += 1;
      const requestId = latestRequestId;
      const isStale = (): boolean => disposed || requestId !== latestRequestId;

      void (async () => {
        try {
          const { audio_base64: audioBase64, audio_mime: audioMime } = await synthesizeSpeech(text);
          if (!audioBase64) return;
          // Superseded or unmounted while synthesizing: never start playback.
          if (isStale()) return;
          // Stop any in-flight prompt before starting the next one.
          stopCurrent();
          const handle = await playBase64Audio(audioBase64, audioMime || 'audio/mpeg', {
            maxDurationMs: MAX_SPEAK_MS,
          });
          // Attach the rejection handler before any stop() so a stop-induced
          // rejection of `ended` is never left unhandled.
          handle.ended.catch(swallowAudioStop);
          if (isStale()) {
            // Superseded or unmounted while playback was starting: this handle
            // was never stored, so nothing else would ever stop it.
            handle.stop();
            return;
          }
          handleRef.current = handle;
        } catch (err) {
          log('voice:speak playback failed: %o', err);
        }
      })();
    };

    socketService.on('voice:speak', onSpeak);
    return () => {
      disposed = true;
      socketService.off('voice:speak', onSpeak);
      stopCurrent();
    };
  }, []);
}
15 changes: 10 additions & 5 deletions app/src/pages/Conversations.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -293,7 +293,9 @@ const Conversations = ({
// timer's reference point.
const sendingThreadIdRef = useRef<string | null>(null);
// Ref so the mount-time dictation event handler can call the latest send fn.
const handleSendMessageRef = useRef<((text?: string) => Promise<void>) | null>(null);
const handleSendMessageRef = useRef<
((text?: string, opts?: { voice?: boolean }) => Promise<void>) | null
>(null);
// Previous inference status for the sending thread; lets the rearm effect
// distinguish "status was just cleared (chat_done / chat_error)" from
// "status was never set yet (in-flight turn pre-status)".
Expand Down Expand Up @@ -438,10 +440,12 @@ const Conversations = ({

customEvent.preventDefault();

// When autoSend is set (hotkey dictation), dispatch the transcript directly
// to the agent without going through the text composer.
// When autoSend is set (hotkey dictation / always-on), dispatch the
// transcript directly to the agent without going through the text
// composer. Tag it voice-initiated so the core speaks any approval prompt
// aloud for sensitive actions (Phase 4 of #3148).
if (customEvent.detail?.autoSend) {
void handleSendMessageRef.current?.(text);
void handleSendMessageRef.current?.(text, { voice: true });
return;
}

Expand Down Expand Up @@ -672,7 +676,7 @@ const Conversations = ({
}
};

const handleSendMessage = async (text?: string) => {
const handleSendMessage = async (text?: string, opts?: { voice?: boolean }) => {
if (pendingSendRef.current) return;

const normalized = text ?? inputValue;
Expand Down Expand Up @@ -786,6 +790,7 @@ const Conversations = ({
model: CHAT_MODEL_ID,
profileId: selectedAgentProfileId,
locale: uiLocale,
voice: opts?.voice ?? false,
});
trackEvent('chat_message_sent');
// Backend accepted the send; lifecycle ('started' → 'streaming') now
Expand Down
5 changes: 5 additions & 0 deletions app/src/pages/__tests__/Conversations.render.test.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -708,6 +708,7 @@ describe('Conversations — smoke render (#1123 welcome-lock removal)', () => {
model: 'reasoning-v1',
profileId: 'default',
locale: 'en',
voice: false,
});
});

Expand All @@ -732,6 +733,7 @@ describe('Conversations — smoke render (#1123 welcome-lock removal)', () => {
model: 'reasoning-v1',
profileId: 'default',
locale: 'en',
voice: true,
});
});
});
Expand Down Expand Up @@ -784,6 +786,7 @@ describe('Conversations — smoke render (#1123 welcome-lock removal)', () => {
model: 'reasoning-v1',
profileId: 'default',
locale: 'en',
voice: false,
});
expect(screen.getByRole('button', { name: 'Send message' })).toBeDisabled();
resolveSend?.();
Expand Down Expand Up @@ -1149,6 +1152,7 @@ describe('Conversations — smoke render (#1123 welcome-lock removal)', () => {
model: 'reasoning-v1',
profileId: 'default',
locale: 'en',
voice: false,
});
});
});
Expand Down Expand Up @@ -1224,6 +1228,7 @@ describe('Conversations — smoke render (#1123 welcome-lock removal)', () => {
model: 'reasoning-v1',
profileId: 'default',
locale: 'en',
voice: false,
});
});
});
Expand Down
8 changes: 8 additions & 0 deletions app/src/services/chatService.ts
Original file line number Diff line number Diff line change
Expand Up @@ -963,6 +963,13 @@ export interface ChatSendParams {
* (default) aborts the running turn.
*/
queueMode?: QueueMode | null;
/**
* `true` when this turn was voice-initiated (dictation / always-on
* listening). The core speaks the approval prompt aloud for sensitive
* actions on voice turns so a hands-free user can answer by voice
* (Phase 4 of #3148). Omitted/`false` keeps typed turns visual-only.
*/
voice?: boolean | null;
}

/**
Expand All @@ -989,6 +996,7 @@ export async function chatSend(params: ChatSendParams): Promise<void> {
profile_id: params.profileId ?? undefined,
locale: params.locale ?? undefined,
queue_mode: params.queueMode ?? undefined,
voice: params.voice ?? undefined,
},
});
}
Expand Down
27 changes: 22 additions & 5 deletions docs/voice-system-actions.md
Original file line number Diff line number Diff line change
Expand Up @@ -591,14 +591,31 @@ Shipped on the Windows machine (2026-06-02):

---

## Phase 4 — Polish ⏳ Not Started
## Phase 4 — Polish 🔨 In progress

> Voice confirmation loop, UI indicator, computer control onboarding.

**Planned:**
- TTS confirmation before executing sensitive actions ("Opening Music — confirm?")
- Always-on status indicator (notch pill from PR #3166 will handle this automatically)
- Computer control (`mouse`/`keyboard` tools) toggle in Settings onboarding
- [x] **TTS confirmation before executing sensitive actions** — ✅ Done (voice-native approval; see below)
- [x] Always-on status indicator (notch pill from PR #3166 handles this automatically)
- [ ] Computer control (`mouse`/`keyboard` tools) toggle in Settings onboarding

### Change 4.1 — Voice-native approval (spoken TTS confirmation) ✅ Done

**Status:** ✅ Shipped. The existing `ApprovalGate` already classifies sensitive agent tool calls and parks them for a yes/no decision, but the prompt was **visual-only** (the in-app approval card). A hands-free / always-on user looking away from the screen never heard it. This makes the gate **voice-native**: when a sensitive action is parked during a **voice-initiated** turn, the assistant **speaks** the prompt aloud and the user answers by voice.

**Decisions (agreed up front):** make the *existing* approval gate voice-native (not a parallel voice-fast-path gate), and speak **only for voice-initiated turns** (typed approvals stay visual-only).

**Fix — reuses the approval gate, the turn-origin label, and the overlay socket-bridge pattern:**
- **Origin flag** — a `voice: bool` on `ApprovalChatContext` (set from a new optional `voice` param on the `channel_web_chat` RPC / `chat:start` socket event), stamped onto `DomainEvent::ApprovalRequested { is_voice }` at publish time in `approval/gate.rs`. The frontend tags **dictation / always-on auto-sends** (`Conversations.tsx` → `chatSend({ voice: true })`); typed turns omit it.
- **Voice approval surface** — `src/openhuman/voice/approval_surface.rs` (new), an `EventHandler` mirroring `telegram/approval_surface.rs`: on `ApprovalRequested { is_voice: true }` it builds a short spoken line (`spoken_prompt`: `"<summary>. Say yes to confirm, or no to cancel."`) and publishes it. Registered at startup alongside the web/telegram surfaces.
- **Speak primitive** — `src/openhuman/voice/speak_bus.rs` (new), a `publish_speak` / `subscribe_speak_events` broadcast mirroring `overlay/bus.rs`; `core/socketio.rs` bridges it to a `voice:speak` Socket.IO event (next to the `overlay:attention` bridge).
- **Frontend playback** — `useVoiceSpeak` (new, mounted app-wide in `AppShellDesktop`) subscribes to `voice:speak` and plays it through the existing TTS pipeline (`synthesizeSpeech` → `playBase64Audio`), so the prompt is heard even when the mascot view isn't open.
- **Spoken answer — no new code:** a voice "yes"/"no" rides the existing transcription → auto-send → `web.rs` ingress yes/no router → `approval_decide`.

**Tests:** Rust — `spoken_prompt` formatter, the `is_voice` gate (speaks on `true`, **silent on `false`** and on non-approval events, no-speak on empty summary), `speak_bus` round-trip. Frontend (Vitest) — `useVoiceSpeak` synthesizes + plays on `voice:speak`, ignores empty/malformed payloads, unsubscribes on unmount.

**Known follow-up:** the spoken suffix ("Say yes to confirm") is built server-side in English for v1; localizing it through the i18n system (the spoken text isn't a frontend string) is deferred.

---

Expand Down Expand Up @@ -652,6 +669,6 @@ From live agent-in-the-loop testing on 2026-06-03 (grounded in `~/.openhuman/log
| 2 | Text-based "Hey Tiny" wake word | ✅ Done (interim; gates delivery, strips phrase) |
| 3 | Local command router (intent classifier) | ✅ Done & wired (recognized intents run on the ≤500ms local path; Unknown defers to agent) |
| 3 | On-device audio wake-word model | ⏳ Not started (text-based match is the interim) |
| 4 | Voice confirmation loop | ⏳ Not started |
| 4 | Voice confirmation loop (spoken TTS approval, Change 4.1) | ✅ Done (voice-native approval gate: speaks the prompt for voice-initiated turns; spoken yes/no) |
| 4 | Computer-control onboarding toggle | ⏳ Not started |
| 4 | Always-on UI indicator | ✅ Done (notch PR #3166) |
6 changes: 6 additions & 0 deletions src/core/event_bus/events.rs
Original file line number Diff line number Diff line change
Expand Up @@ -385,6 +385,12 @@ pub enum DomainEvent {
/// Socket.IO client id (room) to surface the approval question to,
/// when known. `None` for non-chat callers.
client_id: Option<String>,
/// Whether the gated turn was **voice-initiated** (dictation / always-on
/// listening). When `true`, the voice approval surface
/// (`crate::openhuman::voice::approval_surface`) speaks the prompt aloud
/// so a hands-free user can answer by voice. `false` for typed turns —
/// they stay visual-only (the in-app approval card). Phase 4 of #3148.
is_voice: bool,
},
/// User decided a pending approval. Published by `approval_decide`
/// RPC handler after the gate's parked future resolves.
Expand Down
1 change: 1 addition & 0 deletions src/core/event_bus/events_tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -525,6 +525,7 @@ fn approval_requested_does_not_surface_session_id() {
args_redacted: serde_json::json!({ "tool_slug": "SLACK_SEND" }),
thread_id: Some("t-1".to_string()),
client_id: Some("c-1".to_string()),
is_voice: false,
};
let dbg = format!("{event:?}");
assert!(
Expand Down
2 changes: 2 additions & 0 deletions src/core/jsonrpc.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2155,6 +2155,8 @@ pub async fn bootstrap_core_runtime(host_kind: crate::core::types::HostKind) {
// frontend → every prompt dies at the TTL. Idempotent (Once-guarded).
crate::openhuman::channels::providers::web::register_approval_surface_subscriber();
crate::openhuman::channels::providers::web::register_artifact_surface_subscriber();
// Speak approval prompts for voice-initiated turns (Phase 4 of #3148).
crate::openhuman::voice::approval_surface::register_voice_approval_surface();
} else {
log::error!(
"[runtime] approval gate DISABLED (OPENHUMAN_APPROVAL_GATE=0 honored on host={}) — \
Expand Down
Loading
Loading