|
| 1 | +#!/usr/bin/env python3 |
| 2 | +""" |
| 3 | +Pre-generate phoneme test audio for the realtime harness. |
| 4 | +
|
| 5 | +Creates one file per segment in data/phoneme_prompts/: |
| 6 | +- segment_01: 1 s silence (WAV) |
| 7 | +- segment_02..15: TTS of the target sound repeated REPEAT_COUNT times (e.g. "e e e e e", "thin thin ..." for /θ/) as MP3. |
| 8 | +
|
| 9 | +At run time, --speak plays the clip for each segment in the background while the mic |
| 10 | +captures; the harness checks that the expected viseme is detected (peak activation, top-2). |
| 11 | +
|
| 12 | +TODO: Replace TTS clips with human-recorded phoneme clips (same filenames) for a better |
| 13 | + pass rate and a more natural test; see TODO.md (phoneme test audio). |
| 14 | +
|
| 15 | +Usage (from project root): |
| 16 | + uv run --extra realtime python tools/generate_phoneme_prompts.py |
| 17 | +""" |
| 18 | + |
| 19 | +from __future__ import annotations |
| 20 | + |
| 21 | +import asyncio |
| 22 | +import sys |
| 23 | +import wave |
| 24 | +from pathlib import Path |
| 25 | + |
# The script lives one directory below the project root, so step up one level
# from the directory containing this file.
PROJECT_ROOT = Path(__file__).resolve().parents[1]
OUT_DIR = PROJECT_ROOT.joinpath("data", "phoneme_prompts")

# One entry per segment, in segment order (entry 0 is segment 1).
# The first entry is None because segment 1 is silence written as a WAV
# rather than synthesized; every other entry is the text handed to TTS.
# "thin" stands in for TH so the voice says the /θ/ sound instead of
# spelling out the letters ("tee aitch").
PHONEME_TEXTS = [
    None,  # segment 1: silence, no TTS
    "e",
    "ah",
    "eh",
    "oh",
    "oo",
    "p",
    "f",
    "thin",
    "t",
    "k",
    "sh",
    "s",
    "n",
    "r",
]

# How many times each sound is spoken within its segment (stronger test signal).
REPEAT_COUNT = 5

# Frame rate, in Hz, of the generated silence WAV.
SAMPLE_RATE = 16000

# edge-tts voice used for every synthesized segment.
VOICE = "en-GB-SoniaNeural"
| 39 | + |
| 40 | + |
def write_silence_wav(
    path: Path,
    duration_sec: float = 1.0,
    sample_rate: int | None = None,
) -> None:
    """Write a mono, 16-bit PCM WAV file that contains only silence.

    Args:
        path: Destination file. It is opened in "wb" mode, so an existing
            file at this location is replaced.
        duration_sec: Clip length in seconds. Must not be negative.
        sample_rate: Frames per second. When omitted, the module-level
            SAMPLE_RATE is used, which matches the previous behaviour.

    Raises:
        ValueError: If duration_sec is negative or not a finite number, or
            if sample_rate is not positive.
    """
    if sample_rate is None:
        sample_rate = SAMPLE_RATE
    if sample_rate <= 0:
        raise ValueError(f"sample_rate must be positive, got {sample_rate!r}")
    if duration_sec < 0:
        raise ValueError(f"duration_sec must not be negative, got {duration_sec!r}")

    # Compute the frame count before touching the filesystem so that an
    # invalid duration (e.g. NaN, for which int() raises ValueError) never
    # leaves a half-written file behind.
    frame_count = int(sample_rate * duration_sec)

    with wave.open(str(path), "wb") as wav_file:
        wav_file.setnchannels(1)  # mono
        wav_file.setsampwidth(2)  # 2 bytes per sample = 16-bit
        wav_file.setframerate(sample_rate)
        # bytes(n) is n zero bytes; each silent frame is two of them.
        wav_file.writeframes(bytes(2 * frame_count))
| 48 | + |
| 49 | + |
async def main() -> None:
    """Generate every phoneme prompt clip into OUT_DIR.

    Writes segment_01.wav as silence, then, for each remaining entry of
    PHONEME_TEXTS, asks edge-tts to speak the entry REPEAT_COUNT times
    (space-separated) with VOICE and saves the result as segment_NN.mp3.
    Clips are produced one after another, and progress is printed to stdout.

    If edge-tts cannot be imported, an installation hint is printed to
    stderr and the process exits with status 1.
    """
    # Imported lazily so a missing optional dependency yields a helpful
    # message instead of a traceback at module import time.
    try:
        import edge_tts
    except ImportError:
        install_hint = (
            "edge-tts is not installed. From the project root, run:\n"
            " uv run --extra realtime python tools/generate_phoneme_prompts.py\n"
            "(--extra realtime installs edge-tts into the environment, then runs this script.)"
        )
        print(install_hint, file=sys.stderr)
        sys.exit(1)

    OUT_DIR.mkdir(parents=True, exist_ok=True)
    print(f"Writing {len(PHONEME_TEXTS)} files to {OUT_DIR}")

    # The first segment is plain silence and needs no TTS.
    silence_path = OUT_DIR / "segment_01.wav"
    write_silence_wav(silence_path)
    print(f" {silence_path.name} (silence)")

    # Remaining entries map to segments 2, 3, ... in list order.
    for offset, sound in enumerate(PHONEME_TEXTS[1:]):
        segment_no = offset + 2
        clip_path = OUT_DIR / f"segment_{segment_no:02d}.mp3"
        # Say the sound several times so the segment gives a solid test signal.
        utterance = " ".join(sound for _ in range(REPEAT_COUNT))
        await edge_tts.Communicate(utterance, VOICE).save(str(clip_path))
        print(f" {clip_path.name} ({sound} x{REPEAT_COUNT})")

    print("Done. Run phoneme-test with --speak: each segment plays this sound while the mic captures and checks the viseme.")
| 79 | + |
| 80 | + |
# Script entry point: run the async generator to completion on a fresh event
# loop. Nothing happens when this module is merely imported.
if __name__ == "__main__":
    asyncio.run(main())
0 commit comments