Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3,162 changes: 1,587 additions & 1,575 deletions .github/workflows/cicd-main.yml

Large diffs are not rendered by default.

19 changes: 16 additions & 3 deletions nemo/collections/tts/models/magpietts_preference_optimization.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,16 @@
# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
import json
import os
Expand Down Expand Up @@ -31,7 +44,7 @@ class MagpieTTSModelOfflinePODataGen(MagpieTTSModel):
This class is used in 'test' mode and leverages trainer.test() for multi-GPU/multi-node inference.
Saves the predicted audio files and logs the CER/WER metrics as individual json files for each audio.
"""

def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None):
super().__init__(cfg, trainer)
if cfg.get('pref_set_language', "en") == "en":
Expand Down Expand Up @@ -143,8 +156,8 @@ def test_step(self, batch, batch_idx):

class MagpieTTSModelOfflinePO(MagpieTTSModel):
"""
MagpieTTS_Model_OfflinePO is a class that extends MagpieTTS_Model to support
offline preference optimization (DPO, IPO, RPO).
MagpieTTS_Model_OfflinePO is a class that extends MagpieTTS_Model to support
offline preference optimization (DPO, IPO, RPO).
Set cfg.model.dpo_loss_type to 'dpo', 'ipo', or 'rpo' to use the corresponding loss.
"""
def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None):
Expand Down
13 changes: 13 additions & 0 deletions scripts/magpietts/codec_extraction.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,16 @@
# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import torch
from torch.utils.data import Dataset, DataLoader
Expand Down
18 changes: 15 additions & 3 deletions scripts/magpietts/eval_squimmos.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,16 @@
from torchaudio.pipelines import SQUIM_OBJECTIVE, SQUIM_SUBJECTIVE
# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.from torchaudio.pipelines import SQUIM_OBJECTIVE, SQUIM_SUBJECTIVE
import os
import json
import torch
Expand All @@ -22,7 +34,7 @@ def compute_mean_and_confidence_interval(measurements, confidence=0.95):
std_err = stats.sem(measurements)

confidence_interval = std_err * stats.t.ppf((1 + confidence) / 2, len(measurements) - 1)

return "{:.4f} +/- {:.4f}".format(mean, confidence_interval), mean, confidence_interval

def main():
Expand Down Expand Up @@ -54,7 +66,7 @@ def main():
with torch.no_grad():
squm_mos_score = squim_mos_model(pred_wav, gt_wav)
squim_score_list.append(squm_mos_score.item())

mean_with_ci, mean, confidence_interval = compute_mean_and_confidence_interval(squim_score_list)
# Add to audio_dir,mean_with_ci to csv
with open(out_file, "a") as f:
Expand Down
17 changes: 15 additions & 2 deletions scripts/magpietts/evalset_config.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,16 @@
# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
dataset_meta_info = {
'vctk': {
'manifest_path' : '/home/pneekhara/2023/SimpleT5NeMo/manifests/smallvctk__phoneme__nemo_audio_21fps_8codebooks_2kcodes_v2bWithWavLM_simplet5_withcontextaudiopaths.json',
Expand Down Expand Up @@ -29,12 +42,12 @@
'manifest_path' : '/datap/misc/speechllm_codecdatasets/manifests/t5_exp/dev_clean_withContextAudioPaths_withTargetCodes_evalset_mid.json',
'audio_dir' : '/datap/misc/Datasets/LibriTTS',
'feature_dir' : '/datap/misc/Datasets/LibriTTS',
},
},
'libri_dev_clean_eval_tiny': {
'manifest_path' : '/datap/misc/speechllm_codecdatasets/manifests/t5_exp/dev_clean_withContextAudioPaths_withTargetCodes_evalset_tiny.json',
'audio_dir' : '/datap/misc/Datasets/LibriTTS',
'feature_dir' : '/datap/misc/Datasets/LibriTTS',
},
},
'libri_val': {
'manifest_path' : '/home/pneekhara/2023/SimpleT5NeMo/manifests/libri360_val.json',
'audio_dir' : '/datap/misc/LibriTTSfromNemo/LibriTTS',
Expand Down
13 changes: 13 additions & 0 deletions scripts/magpietts/evaluate_generated_audio.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,16 @@
# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import json
import os
Expand Down
15 changes: 14 additions & 1 deletion scripts/magpietts/infer_and_evaluate.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,16 @@
# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import copy
import glob
Expand Down Expand Up @@ -411,7 +424,7 @@ def main():
start_prior_after_n_audio_steps=args.start_prior_after_n_audio_steps,
confidence_level=args.confidence_level,
use_local_transformer=args.use_local_transformer,
maskgit_n_steps=args.maskgit_n_steps,
maskgit_n_steps=args.maskgit_n_steps,
legacy_codebooks=args.legacy_codebooks
)
else:
Expand Down
13 changes: 13 additions & 0 deletions scripts/tts_dataset_to_lhotse/create_shars.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,16 @@
# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from pathlib import Path
import json
import os
Expand Down
1 change: 1 addition & 0 deletions tests/collections/tts/modules/test_audio_codec_modules.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,6 +240,7 @@ def test_rvq_eval(self, num_codebooks: int):
torch.testing.assert_close(indices_enc, indices_fw, msg=f'example {i}: indices mismatch')
torch.testing.assert_close(dequantized_dec, dequantized_fw, msg=f'example {i}: dequantized mismatch')

@pytest.mark.pleasefixme
@pytest.mark.unit
@pytest.mark.parametrize('num_groups', [1, 2, 4])
@pytest.mark.parametrize('num_codebooks', [1, 4])
Expand Down
33 changes: 33 additions & 0 deletions tests/functional_tests/L2_TTS_Fast_dev_runs_Magpietts_config1.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Copyright (c) 2020-2025, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
coverage run --branch -a --data-file=/workspace/.coverage --source=/workspace/nemo examples/tts/magpietts.py \
--config-name magpietts_dc_en \
+train_ds_meta.an4.manifest_path="/home/TestData/an4_dataset/an4_train.json" \
+train_ds_meta.an4.audio_dir="/" \
+train_ds_meta.an4.tokenizer_names="[english_phoneme]" \
+train_ds_meta.an4.feature_dir=null \
+val_ds_meta.an4.manifest_path="/home/TestData/an4_dataset/an4_val.json" \
+val_ds_meta.an4.audio_dir="/" \
+val_ds_meta.an4.tokenizer_names="[english_phoneme]" \
+val_ds_meta.an4.feature_dir=null \
max_epochs=1 \
batch_size=4 \
model.codecmodel_path="/home/TestData/tts/21fps_causal_codecmodel.nemo" \
trainer.devices="[0]" \
+trainer.limit_train_batches=1 \
+trainer.limit_val_batches=1 \
trainer.strategy=auto \
model.train_ds.dataloader_params.num_workers=0 \
model.validation_ds.dataloader_params.num_workers=0 \
~trainer.check_val_every_n_epoch
Loading