Skip to content

Commit e1d7e6d

Browse files
authored
Merge pull request #12 from yuyun2000/opt/melotts
Make MeloTTS logs English & add G2P debug toggle and SOLA algorithm
2 parents 840f739 + a151aff commit e1d7e6d

File tree

3 files changed

+366
-73
lines changed

3 files changed

+366
-73
lines changed

projects/llm_framework/main_melotts/src/main.cpp

Lines changed: 45 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
#include "Lexicon.hpp"
1010
#include <ax_sys_api.h>
1111
#include "AudioFile.h"
12+
#include "SolaProcessor.h"
1213
#include "Lexicon.hpp"
1314

1415
#include <signal.h>
@@ -263,49 +264,71 @@ class llm_task {
263264
auto encoder_output =
264265
encoder_->Run(phones, tones, langids, g_matrix, mode_config_.noise_scale, mode_config_.noise_scale_w,
265266
mode_config_.get_length_scale(), mode_config_.sdp_ratio);
266-
float *zp_data = encoder_output.at(0).GetTensorMutableData<float>();
267-
int audio_len = encoder_output.at(2).GetTensorMutableData<int>()[0];
268-
auto zp_info = encoder_output.at(0).GetTensorTypeAndShapeInfo();
269-
auto zp_shape = zp_info.GetShape();
270-
int zp_size = decoder_->GetInputSize(0) / sizeof(float);
271-
int dec_len = zp_size / zp_shape[1];
272-
int audio_slice_len = decoder_->GetOutputSize(0) / sizeof(float);
273-
std::vector<float> decoder_output(audio_slice_len);
274-
int dec_slice_num = int(std::ceil(zp_shape[2] * 1.0 / dec_len));
267+
float *zp_data = encoder_output.at(0).GetTensorMutableData<float>();
268+
int audio_len = encoder_output.at(2).GetTensorMutableData<int>()[0];
269+
auto zp_info = encoder_output.at(0).GetTensorTypeAndShapeInfo();
270+
auto zp_shape = zp_info.GetShape();
271+
272+
// Decoder parameters setup
273+
int zp_size = decoder_->GetInputSize(0) / sizeof(float);
274+
int dec_len = zp_size / zp_shape[1];
275+
int audio_slice_len = decoder_->GetOutputSize(0) / sizeof(float);
276+
const int pad_frames = 16;
277+
const int samples_per_frame = 512;
278+
const int effective_frames = dec_len - 2 * pad_frames;
279+
int dec_slice_num =
280+
static_cast<int>(std::ceil(static_cast<double>(zp_shape[2]) / static_cast<double>(effective_frames)));
281+
SolaProcessor sola(pad_frames, samples_per_frame);
275282
std::vector<float> pcmlist;
283+
276284
for (int i = 0; i < dec_slice_num; i++) {
285+
int input_start = i * effective_frames;
286+
if (i > 0) {
287+
input_start -= pad_frames;
288+
}
289+
input_start = std::max(0, input_start);
290+
int actual_len = std::min(dec_len, static_cast<int>(zp_shape[2] - input_start));
277291
std::vector<float> zp(zp_size, 0);
278-
int actual_size = (i + 1) * dec_len < zp_shape[2] ? dec_len : zp_shape[2] - i * dec_len;
292+
279293
for (int n = 0; n < zp_shape[1]; n++) {
280-
memcpy(zp.data() + n * dec_len, zp_data + n * zp_shape[2] + i * dec_len,
281-
sizeof(float) * actual_size);
294+
int copy_size = std::min(actual_len, static_cast<int>(zp_shape[2] - input_start));
295+
if (copy_size > 0) {
296+
memcpy(zp.data() + n * dec_len, zp_data + n * zp_shape[2] + input_start,
297+
sizeof(float) * copy_size);
298+
}
282299
}
300+
// Run decoder
301+
std::vector<float> decoder_output(audio_slice_len);
283302
decoder_->SetInput(zp.data(), 0);
284303
decoder_->SetInput(g_matrix.data(), 1);
285304
if (0 != decoder_->Run()) {
286-
printf("Run decoder model failed!\n");
287305
throw std::string("decoder_ RunSync error");
288306
}
289307
decoder_->GetOutput(decoder_output.data(), 0);
290-
actual_size = (i + 1) * audio_slice_len < audio_len ? audio_slice_len : audio_len - i * audio_slice_len;
291-
if (decoder_output.size() > actual_size) {
292-
pcmlist.reserve(pcmlist.size() + actual_size);
293-
std::copy(decoder_output.begin(), decoder_output.begin() + actual_size,
294-
std::back_inserter(pcmlist));
295-
} else {
296-
pcmlist.reserve(pcmlist.size() + decoder_output.size());
297-
std::copy(decoder_output.begin(), decoder_output.end(), std::back_inserter(pcmlist));
298-
}
308+
std::vector<float> processed_output = sola.ProcessFrame(decoder_output, i, dec_slice_num, actual_len);
309+
310+
pcmlist.insert(pcmlist.end(), processed_output.begin(), processed_output.end());
299311
}
312+
300313
double src_ratio = (mode_config_.audio_rate * 1.0f) / (mode_config_.mode_rate * 1.0f);
301314
std::vector<float> tmp_pcm((pcmlist.size() * src_ratio + 1));
302315
int len;
303316
resample_audio(pcmlist.data(), pcmlist.size(), tmp_pcm.data(), &len, src_ratio);
317+
318+
// Convert to 16-bit PCM
319+
wav_pcm_data.reserve(len);
304320
std::transform(tmp_pcm.begin(), tmp_pcm.begin() + len, std::back_inserter(wav_pcm_data),
305321
[](const auto val) { return (int16_t)(val * INT16_MAX); });
322+
323+
// Call callback function with output
306324
if (out_callback_)
307325
out_callback_(std::string((char *)wav_pcm_data.data(), wav_pcm_data.size() * sizeof(int16_t)), finish);
326+
327+
} catch (const std::exception &e) {
328+
SLOGI("TTS processing exception: %s", e.what());
329+
return true;
308330
} catch (...) {
331+
SLOGI("TTS processing encountered unknown exception");
309332
return true;
310333
}
311334
return false;

projects/llm_framework/main_melotts/src/runner/Lexicon.hpp

Lines changed: 52 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
#pragma once
2-
32
#include <string>
43
#include <vector>
54
#include <fstream>
@@ -9,7 +8,15 @@
98
#include <cassert>
109
#include <iostream>
1110
#include "../../../../../SDK/components/utilities/include/sample_log.h"
12-
11+
// Debug logging switch - set to true to enable debug logs
12+
// Declared `inline` (C++17) so that every translation unit including this header shares ONE flag.
// With `static`, each includer would get its own private copy, so Lexicon::setDebugLogging()
// would only toggle the copy belonging to the translation unit that called it.
inline bool DEBUG_LOGGING = false;
13+
// Conditional logging macro: forwards the format string and its arguments to SLOGI only
// while DEBUG_LOGGING is true; otherwise the call is skipped at runtime.
// The do { ... } while (0) wrapper makes the expansion behave as a single statement.
14+
#define DEBUG_LOG(fmt, ...) \
15+
do { \
16+
if (DEBUG_LOGGING) { \
17+
SLOGI(fmt, ##__VA_ARGS__); \
18+
} \
19+
} while (0)
1320
std::vector<std::string> split(const std::string& s, char delim)
1421
{
1522
std::vector<std::string> result;
@@ -30,8 +37,16 @@ class Lexicon {
3037
std::unordered_map<int, std::string> reverse_tokens;
3138

3239
public:
40+
// Turns DEBUG_LOG output on or off by storing `enable` in the DEBUG_LOGGING flag.
// Static, so it can be called without a Lexicon instance.
41+
static void setDebugLogging(bool enable)
42+
{
43+
DEBUG_LOGGING = enable;
44+
}
3345
Lexicon(const std::string& lexicon_filename, const std::string& tokens_filename) : max_phrase_length(0)
3446
{
47+
DEBUG_LOG("Dictionary loading: %s Pronunciation table loading: %s", tokens_filename.c_str(),
48+
lexicon_filename.c_str());
49+
3550
std::unordered_map<std::string, int> tokens;
3651
std::ifstream ifs(tokens_filename);
3752
assert(ifs.is_open());
@@ -82,8 +97,10 @@ class Lexicon {
8297
lexicon[""] = lexicon["."];
8398
lexicon[""] = lexicon["!"];
8499
lexicon[""] = lexicon["?"];
85-
SLOGI("词典加载完成,包含 %zu 个条目,最长词组长度: %zu", lexicon.size(), max_phrase_length);
100+
DEBUG_LOG("Dictionary loading complete, containing %zu entries, longest phrase length: %zu", lexicon.size(),
101+
max_phrase_length);
86102
}
103+
87104
std::vector<std::string> splitEachChar(const std::string& text)
88105
{
89106
std::vector<std::string> words;
@@ -94,93 +111,77 @@ class Lexicon {
94111
if ((text[i] & 0x80) == 0x00) {
95112
// ASCII
96113
} else if ((text[i] & 0xE0) == 0xC0) {
97-
next = 2; // 2字节UTF-8
114+
next = 2; // 2-byte UTF-8
98115
} else if ((text[i] & 0xF0) == 0xE0) {
99-
next = 3; // 3字节UTF-8
116+
next = 3; // 3-byte UTF-8
100117
} else if ((text[i] & 0xF8) == 0xF0) {
101-
next = 4; // 4字节UTF-8
118+
next = 4; // 4-byte UTF-8
102119
}
103120
words.push_back(text.substr(i, next));
104121
i += next;
105122
}
106123
return words;
107124
}
125+
108126
// Returns true only when `s` is exactly one byte long and that byte is an ASCII letter
// ('A'-'Z' or 'a'-'z'); any longer or empty string yields false.
bool is_english(const std::string& s)
109127
{
110128
return s.size() == 1 && ((s[0] >= 'A' && s[0] <= 'Z') || (s[0] >= 'a' && s[0] <= 'z'));
111129
}
112-
113130
// Returns true when `s` is a single byte that is an ASCII letter, an ASCII digit,
// '-' or '_'; returns false for any string whose size is not exactly 1.
bool is_english_token_char(const std::string& s)
114131
{
115132
if (s.size() != 1) return false;
116133
char c = s[0];
117134
return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') || c == '-' || c == '_';
118135
}
119-
120136
void process_unknown_english(const std::string& word, std::vector<int>& phones, std::vector<int>& tones)
121137
{
122-
SLOGI("Processing unknown term: %s", word.c_str());
123-
138+
DEBUG_LOG("Processing unknown term: %s", word.c_str());
124139
std::string orig_word = word;
125140
std::vector<std::string> parts;
126141
std::vector<std::string> phonetic_parts;
127-
128142
size_t start = 0;
129143
while (start < word.size()) {
130144
bool matched = false;
131-
132145
for (size_t len = std::min(word.size() - start, (size_t)10); len > 0 && !matched; --len) {
133146
std::string sub_word = word.substr(start, len);
134147
std::string lower_sub_word = sub_word;
135148
std::transform(lower_sub_word.begin(), lower_sub_word.end(), lower_sub_word.begin(),
136149
[](unsigned char c) { return std::tolower(c); });
137-
138150
if (lexicon.find(lower_sub_word) != lexicon.end()) {
139151
// Substring found in lexicon
140152
auto& [sub_phones, sub_tones] = lexicon[lower_sub_word];
141153
phones.insert(phones.end(), sub_phones.begin(), sub_phones.end());
142154
tones.insert(tones.end(), sub_tones.begin(), sub_tones.end());
143-
144155
parts.push_back(sub_word);
145156
phonetic_parts.push_back(phonesToString(sub_phones));
146-
147-
SLOGI(" Matched: '%s' -> %s", sub_word.c_str(), phonesToString(sub_phones).c_str());
148-
157+
DEBUG_LOG(" Matched: '%s' -> %s", sub_word.c_str(), phonesToString(sub_phones).c_str());
149158
start += len;
150159
matched = true;
151160
break;
152161
}
153162
}
154-
155163
if (!matched) {
156164
std::string single_char = word.substr(start, 1);
157165
std::string lower_char = single_char;
158166
std::transform(lower_char.begin(), lower_char.end(), lower_char.begin(),
159167
[](unsigned char c) { return std::tolower(c); });
160-
161168
if (lexicon.find(lower_char) != lexicon.end()) {
162169
auto& [char_phones, char_tones] = lexicon[lower_char];
163170
phones.insert(phones.end(), char_phones.begin(), char_phones.end());
164171
tones.insert(tones.end(), char_tones.begin(), char_tones.end());
165-
166172
parts.push_back(single_char);
167173
phonetic_parts.push_back(phonesToString(char_phones));
168-
169-
SLOGI(" Single char: '%s' -> %s", single_char.c_str(), phonesToString(char_phones).c_str());
174+
DEBUG_LOG(" Single char: '%s' -> %s", single_char.c_str(), phonesToString(char_phones).c_str());
170175
} else {
171176
phones.insert(phones.end(), unknown_token.first.begin(), unknown_token.first.end());
172177
tones.insert(tones.end(), unknown_token.second.begin(), unknown_token.second.end());
173-
174178
parts.push_back(single_char);
175179
phonetic_parts.push_back("_unknown_");
176-
177-
SLOGI(" Unknown: '%s'", single_char.c_str());
180+
DEBUG_LOG(" Unknown: '%s'", single_char.c_str());
178181
}
179-
180182
start++;
181183
}
182184
}
183-
184185
std::string parts_str, phonetic_str;
185186
for (size_t i = 0; i < parts.size(); i++) {
186187
if (i > 0) {
@@ -190,20 +191,20 @@ class Lexicon {
190191
parts_str += parts[i];
191192
phonetic_str += phonetic_parts[i];
192193
}
193-
194-
SLOGI("%s\t|\tDecomposed: %s\t|\tPhonetics: %s", orig_word.c_str(), parts_str.c_str(), phonetic_str.c_str());
194+
DEBUG_LOG("%s\t|\tDecomposed: %s\t|\tPhonetics: %s", orig_word.c_str(), parts_str.c_str(),
195+
phonetic_str.c_str());
195196
}
197+
196198
void convert(const std::string& text, std::vector<int>& phones, std::vector<int>& tones)
197199
{
198-
SLOGI("\n开始处理文本: \"%s\"", text.c_str());
199-
SLOGI("=======匹配结果=======");
200-
SLOGI("单元\t|\t音素\t|\t声调");
201-
SLOGI("-----------------------------");
200+
DEBUG_LOG("\nStarting text processing: \"%s\"", text.c_str());
201+
DEBUG_LOG("=======Matching Results=======");
202+
DEBUG_LOG("Unit\t|\tPhonemes\t|\tTones");
203+
DEBUG_LOG("-----------------------------");
202204
phones.insert(phones.end(), unknown_token.first.begin(), unknown_token.first.end());
203205
tones.insert(tones.end(), unknown_token.second.begin(), unknown_token.second.end());
204-
205-
SLOGI("<BOS>\t|\t%s\t|\t%s", phonesToString(unknown_token.first).c_str(),
206-
tonesToString(unknown_token.second).c_str());
206+
DEBUG_LOG("<BOS>\t|\t%s\t|\t%s", phonesToString(unknown_token.first).c_str(),
207+
tonesToString(unknown_token.second).c_str());
207208
auto chars = splitEachChar(text);
208209
int i = 0;
209210
while (i < chars.size()) {
@@ -220,8 +221,8 @@ class Lexicon {
220221
auto& [eng_phones, eng_tones] = lexicon[eng_word];
221222
phones.insert(phones.end(), eng_phones.begin(), eng_phones.end());
222223
tones.insert(tones.end(), eng_tones.begin(), eng_tones.end());
223-
SLOGI("%s\t|\t%s\t|\t%s", orig_word.c_str(), phonesToString(eng_phones).c_str(),
224-
tonesToString(eng_tones).c_str());
224+
DEBUG_LOG("%s\t|\t%s\t|\t%s", orig_word.c_str(), phonesToString(eng_phones).c_str(),
225+
tonesToString(eng_tones).c_str());
225226
} else {
226227
process_unknown_english(orig_word, phones, tones);
227228
}
@@ -240,8 +241,8 @@ class Lexicon {
240241
auto& [phrase_phones, phrase_tones] = lexicon[phrase];
241242
phones.insert(phones.end(), phrase_phones.begin(), phrase_phones.end());
242243
tones.insert(tones.end(), phrase_tones.begin(), phrase_tones.end());
243-
SLOGI("%s\t|\t%s\t|\t%s", phrase.c_str(), phonesToString(phrase_phones).c_str(),
244-
tonesToString(phrase_tones).c_str());
244+
DEBUG_LOG("%s\t|\t%s\t|\t%s", phrase.c_str(), phonesToString(phrase_phones).c_str(),
245+
tonesToString(phrase_tones).c_str());
245246
i += len;
246247
matched = true;
247248
break;
@@ -263,25 +264,25 @@ class Lexicon {
263264
auto& [char_phones, char_tones] = lexicon[s];
264265
phones.insert(phones.end(), char_phones.begin(), char_phones.end());
265266
tones.insert(tones.end(), char_tones.begin(), char_tones.end());
266-
SLOGI("%s\t|\t%s\t|\t%s", orig_char.c_str(), phonesToString(char_phones).c_str(),
267-
tonesToString(char_tones).c_str());
267+
DEBUG_LOG("%s\t|\t%s\t|\t%s", orig_char.c_str(), phonesToString(char_phones).c_str(),
268+
tonesToString(char_tones).c_str());
268269
} else {
269270
phones.insert(phones.end(), unknown_token.first.begin(), unknown_token.first.end());
270271
tones.insert(tones.end(), unknown_token.second.begin(), unknown_token.second.end());
271-
SLOGI("%s\t|\t%s (未匹配)\t|\t%s", orig_char.c_str(), phonesToString(unknown_token.first).c_str(),
272-
tonesToString(unknown_token.second).c_str());
272+
DEBUG_LOG("%s\t|\t%s (Not matched)\t|\t%s", orig_char.c_str(),
273+
phonesToString(unknown_token.first).c_str(), tonesToString(unknown_token.second).c_str());
273274
}
274275
}
275276
}
276277
phones.insert(phones.end(), unknown_token.first.begin(), unknown_token.first.end());
277278
tones.insert(tones.end(), unknown_token.second.begin(), unknown_token.second.end());
278-
SLOGI("<EOS>\t|\t%s\t|\t%s", phonesToString(unknown_token.first).c_str(),
279-
tonesToString(unknown_token.second).c_str());
280-
SLOGI("\n处理结果汇总:");
281-
SLOGI("原文: %s", text.c_str());
282-
SLOGI("音素: %s", phonesToString(phones).c_str());
283-
SLOGI("声调: %s", tonesToString(tones).c_str());
284-
SLOGI("====================");
279+
DEBUG_LOG("<EOS>\t|\t%s\t|\t%s", phonesToString(unknown_token.first).c_str(),
280+
tonesToString(unknown_token.second).c_str());
281+
DEBUG_LOG("\nProcessing Summary:");
282+
DEBUG_LOG("Original text: %s", text.c_str());
283+
DEBUG_LOG("Phonemes: %s", phonesToString(phones).c_str());
284+
DEBUG_LOG("Tones: %s", tonesToString(tones).c_str());
285+
DEBUG_LOG("====================");
285286
}
286287

287288
private:

0 commit comments

Comments
 (0)