-
Notifications
You must be signed in to change notification settings - Fork 0
/
VUI.cs
108 lines (97 loc) · 3.74 KB
/
VUI.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
using System.Collections.Generic;
using System.Threading.Tasks;
using System.Text.Json.Nodes;
using System.Net.Http;
using System.Text;
using System.IO;
using StereoKit;
using System;
// Voice-recording editor: captures microphone audio via StereoKit, streams
// periodic transcriptions to the caller while recording, and returns the
// final transcription of the whole take.
class VUI : Editor {
    public string name { get { return "Record voice"; } }
    // Written from the UI thread (Stop button) and read by the async Edit
    // loop; volatile so the loop observes the write promptly.
    volatile bool isActive = false;
    List<float> samples = new List<float>();
    // Re-transcribe after every 5 seconds of audio at StereoKit's 48 kHz rate.
    int transcribeEvery = 5 * 48000;
    int transcribedChunks = 0;
    MyWhisperAPIClient whisperAPIClient = new MyWhisperAPIClient();

    // Draws the recording UI; only shows anything while a take is active.
    public void DrawUI() {
        if (isActive) {
            UI.HSeparator();
            if (!Microphone.IsRecording) {
                UI.Label("Wait...");
                return;
            }
            if (UI.Button("Stop"))
                isActive = false;
        }
    }

    // Records until the user presses Stop, pushing incremental transcriptions
    // through setText as audio accumulates. Returns the final transcription;
    // setText is reset to initialText before returning so the caller decides
    // how to apply the result.
    public async Task<string> Edit(string initialText, Action<string> setText) {
        isActive = true;
        transcribedChunks = 0;
        await Task.Run(() => Microphone.Start());
        samples.Clear();
        float[] buf = new float[24000]; // 0.5 s of mono audio at 48 kHz
        while (isActive) {
            int unreadSamples = Microphone.Sound.UnreadSamples;
            int readSamples = Microphone.Sound.ReadSamples(ref buf);
            samples.AddRange(buf[0..readSamples]);
            // Push an incremental transcription once a full chunk of new
            // audio has accumulated (originally gated on a mic backlog,
            // which skipped updates whenever the loop kept up).
            if (samples.Count > (transcribedChunks + 1) * transcribeEvery) {
                setText(initialText + await TranscribeAudio());
                transcribedChunks++;
            }
            // Fix: the original delayed only when the mic had a backlog, so
            // the loop busy-spun at full CPU once it caught up. Yield when
            // caught up instead; keep reading back-to-back while behind.
            if (unreadSamples <= readSamples)
                await Task.Delay(300);
        }
        // Fix: drain samples recorded since the last read before stopping,
        // so the tail of the recording isn't lost.
        int tail = Microphone.Sound.ReadSamples(ref buf);
        samples.AddRange(buf[0..tail]);
        Microphone.Stop();
        string result = await TranscribeAudio();
        setText(initialText);
        return result;
    }

    // Serializes the captured samples to WAV and sends them for transcription.
    public async Task<string> TranscribeAudio() {
        byte[] waveArray = ExportWav();
        return await whisperAPIClient.SendRequest(waveArray);
    }

    // Builds a complete RIFF/WAVE file (mono, 48 kHz, 32-bit IEEE float)
    // from the accumulated samples.
    byte[] ExportWav() {
        MemoryStream stream = new MemoryStream();
        using (BinaryWriter bw = new BinaryWriter(stream)) {
            int bitsPerSample = 32; // Float
            Int32 numChannels = 1, sampleRate = 48000; // For StereoKit
            bw.Write(Encoding.ASCII.GetBytes("RIFF"));
            bw.Write((Int32)(36 + samples.Count * bitsPerSample / 8)); // RIFF chunk size
            bw.Write(Encoding.ASCII.GetBytes("WAVE"));
            bw.Write(Encoding.ASCII.GetBytes("fmt "));
            bw.Write((Int32)16); // fmt chunk size
            // Fix: format tag 3 = WAVE_FORMAT_IEEE_FLOAT. The original wrote
            // 1 (integer PCM), which mislabels the 32-bit float samples and
            // makes decoders misinterpret or reject the file.
            bw.Write((Int16)3);
            bw.Write((Int16)numChannels);
            bw.Write(sampleRate);
            bw.Write(sampleRate * numChannels * bitsPerSample / 8); // byte rate
            bw.Write((Int16)(numChannels * bitsPerSample / 8));     // block align
            bw.Write((Int16)(bitsPerSample));
            bw.Write(Encoding.ASCII.GetBytes("data"));
            bw.Write((Int32)(samples.Count * bitsPerSample / 8));
            foreach (float s in samples)
                bw.Write(s);
        }
        return stream.ToArray();
    }
}
// Minimal client for a local Whisper transcription HTTP endpoint.
class MyWhisperAPIClient {
    // Single HttpClient reused for the object's lifetime (a new HttpClient
    // per request risks socket exhaustion).
    HttpClient client = new HttpClient();
    public string apiEndpoint = "http://127.0.0.1:7936/transcribe";

    // Uploads a complete WAV file as multipart form field "wav" and returns
    // the "result" string from the server's JSON response. Connection or
    // HTTP-status failures are reported as user-facing message strings
    // rather than exceptions.
    public async Task<string> SendRequest(byte[] waveArray) {
        // Fix: MultipartFormDataContent and HttpResponseMessage are
        // IDisposable; the original leaked both on every call.
        using MultipartFormDataContent formData = new MultipartFormDataContent();
        formData.Add(new ByteArrayContent(waveArray), "wav", "upload.wav");
        HttpResponseMessage response;
        try {
            response = await client.PostAsync(apiEndpoint, formData);
        }
        catch (HttpRequestException) {
            return " Looks like the audio transcription server isn't configured properly. Check it and rerun.";
        }
        using (response) {
            if (!response.IsSuccessStatusCode) {
                return " The audio transcription server is responding with an error. Check it and rerun.";
            }
            JsonNode node = JsonNode.Parse(await response.Content.ReadAsStreamAsync())!;
            return ((string)node["result"]!);
        }
    }
}