@@ -3,6 +3,8 @@ defaults:
   - _self_
 
 project: vqgan_pretrain_v2
+ckpt_path: checkpoints/hifigan-base-comb-mix-lb-020/step_001200000_weights_only.ckpt
+resume_weights_only: true
 
 # Lightning Trainer
 trainer:
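The two added keys resume from a prior HiFiGAN run but load weights only, leaving optimizer and scheduler state behind. A minimal sketch of what a weights-only resume typically does (the function name and the `strict=False` choice are assumptions, not fish-speech's actual loader):

```python
import torch

def load_weights_only(model: torch.nn.Module, ckpt_path: str) -> None:
    """Restore model parameters from a checkpoint, skipping optimizer state."""
    ckpt = torch.load(ckpt_path, map_location="cpu")
    # Lightning checkpoints keep parameters under "state_dict"; a
    # *_weights_only.ckpt export may already be a bare state dict.
    state_dict = ckpt.get("state_dict", ckpt)
    # strict=False tolerates keys renamed by this change
    # (e.g. discriminator -> discriminators).
    model.load_state_dict(state_dict, strict=False)
```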
@@ -15,22 +17,36 @@ trainer:
 
 sample_rate: 44100
 hop_length: 512
-num_mels: 128
+num_mels: 160
 n_fft: 2048
 win_length: 2048
 segment_size: 256
 
 # Dataset Configuration
 train_dataset:
-  _target_: fish_speech.datasets.vqgan.VQGANDataset
-  filelist: data/Genshin/vq_train_filelist.txt
-  sample_rate: ${sample_rate}
-  hop_length: ${hop_length}
-  slice_frames: ${segment_size}
+  _target_: fish_speech.datasets.vqgan.MixDatast
+  datasets:
+    high-quality-441:
+      prob: 0.5
+      dataset:
+        _target_: fish_speech.datasets.vqgan.VQGANDataset
+        filelist: data/vocoder_data_441/vq_train_filelist.txt
+        sample_rate: ${sample_rate}
+        hop_length: ${hop_length}
+        slice_frames: ${segment_size}
+
+    common-voice:
+      prob: 0.5
+      dataset:
+        _target_: fish_speech.datasets.vqgan.VQGANDataset
+        filelist: data/cv-corpus-16.0-2023-12-06/vq_train_filelist.txt
+        sample_rate: ${sample_rate}
+        hop_length: ${hop_length}
+        slice_frames: ${segment_size}
 
 val_dataset:
   _target_: fish_speech.datasets.vqgan.VQGANDataset
-  filelist: data/Genshin/vq_val_filelist.txt
+  filelist: data/vocoder_data_441/vq_val_filelist.txt
   sample_rate: ${sample_rate}
   hop_length: ${hop_length}
 
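`train_dataset` now draws from two corpora, each with `prob: 0.5`: a curated 44.1 kHz set and Common Voice 16.0. With `slice_frames: ${segment_size}` = 256 frames and `hop_length: 512`, each training slice is 256 × 512 = 131,072 samples, roughly 3 s at 44.1 kHz. A sketch of probability-weighted mixing in the spirit of the `MixDatast` target (illustrative, not the actual fish-speech class):

```python
import random
from torch.utils.data import IterableDataset

class MixedDataset(IterableDataset):
    """Illustrative: endlessly sample items from named sub-datasets,
    choosing the source dataset for each item with probability `prob`."""

    def __init__(self, datasets: dict):
        # datasets: {"name": {"prob": float, "dataset": Dataset}, ...}
        self.entries = list(datasets.values())
        self.weights = [e["prob"] for e in self.entries]

    def __iter__(self):
        while True:
            # Pick a source dataset by weight, then a random item from it.
            entry = random.choices(self.entries, weights=self.weights, k=1)[0]
            ds = entry["dataset"]
            yield ds[random.randrange(len(ds))]
```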
@@ ... @@ model:
   _target_: fish_speech.models.vqgan.VQGAN
   sample_rate: ${sample_rate}
   hop_length: ${hop_length}
-  segment_size: 8192
-  mode: pretrain-stage1
+  segment_size: 32768
+  mode: pretrain
+  freeze_discriminator: true
 
   downsample:
     _target_: fish_speech.models.vqgan.modules.encoders.ConvDownSampler
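Note that `segment_size` at the model level is in samples, not frames: 32768 samples / 512-sample hop = 64 mel frames, about 0.74 s at 44.1 kHz. `freeze_discriminator: true` implies the discriminators' weights stay fixed while the generator trains against them; a common way to implement such a flag (a sketch, not necessarily fish-speech's code):

```python
import torch.nn as nn

def set_frozen(module: nn.Module, frozen: bool = True) -> None:
    """Toggle gradient computation for every parameter of a module."""
    for p in module.parameters():
        p.requires_grad_(not frozen)

# e.g. applied when the config sets freeze_discriminator: true
# set_frozen(model.discriminators, True)
```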
@@ ... @@ model:
     _target_: fish_speech.models.vqgan.modules.encoders.VQEncoder
     in_channels: 256
     vq_channels: 256
-    codebook_size: 1024
-    codebook_layers: 4
+    codebook_size: 256
+    codebook_groups: 4
     downsample: 1
 
   decoder:
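The quantizer change replaces four stacked codebook layers of 1024 entries with four parallel codebook groups of 256 entries: presumably the 256-dim latent is split into four 64-dim sub-vectors, each quantized against its own codebook, so the groups jointly address 256^4 ≈ 4.3 × 10^9 code combinations. A compact sketch of grouped nearest-neighbour quantization (illustrative only, not the VQEncoder implementation):

```python
import torch
import torch.nn as nn

class GroupedVectorQuantizer(nn.Module):
    """Illustrative grouped VQ: split channels into groups and quantize
    each group against its own codebook by nearest neighbour."""

    def __init__(self, channels=256, groups=4, codebook_size=256):
        super().__init__()
        assert channels % groups == 0
        self.groups = groups
        self.dim = channels // groups
        self.codebooks = nn.Parameter(torch.randn(groups, codebook_size, self.dim))

    def forward(self, x):  # x: (batch, channels)
        parts = x.view(x.shape[0], self.groups, self.dim)
        quantized, indices = [], []
        for g in range(self.groups):
            # Distances from each sub-vector to every code in group g.
            d = torch.cdist(parts[:, g], self.codebooks[g])
            idx = d.argmin(dim=-1)
            quantized.append(self.codebooks[g][idx])
            indices.append(idx)
        return torch.cat(quantized, dim=-1), torch.stack(indices, dim=-1)
```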
@@ -80,33 +97,50 @@ model:
     n_layers: 6
 
   generator:
-    _target_: fish_speech.models.vqgan.modules.decoder.Generator
-    initial_channel: ${num_mels}
-    resblock: "1"
+    _target_: fish_speech.models.vqgan.modules.decoder_v2.HiFiGANGenerator
+    hop_length: ${hop_length}
+    upsample_rates: [8, 8, 2, 2, 2]  # aka strides
+    upsample_kernel_sizes: [16, 16, 4, 4, 4]
     resblock_kernel_sizes: [3, 7, 11]
     resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
-    upsample_rates: [8, 8, 2, 2, 2]
+    num_mels: ${num_mels}
     upsample_initial_channel: 512
-    upsample_kernel_sizes: [16, 16, 4, 4, 4]
-
-  discriminator:
-    _target_: fish_speech.models.vqgan.modules.discriminator.EnsembleDiscriminator
-    periods: [2, 3, 5, 7, 11, 17, 23, 37]
-
+    use_template: true
+    pre_conv_kernel_size: 7
+    post_conv_kernel_size: 7
+
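The new `HiFiGANGenerator` takes `hop_length` explicitly, and the upsample schedule must invert it: the per-stage rates multiply out to 8 × 8 × 2 × 2 × 2 = 512, so each mel frame is expanded to exactly one hop of waveform. A quick consistency check:

```python
import math

hop_length = 512
upsample_rates = [8, 8, 2, 2, 2]

# Each transposed-convolution stage upsamples by its rate; the product
# must equal hop_length so one mel frame maps to one hop of audio.
assert math.prod(upsample_rates) == hop_length
```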
+  discriminators:
+    _target_: torch.nn.ModuleDict
+    modules:
+      mpd:
+        _target_: fish_speech.models.vqgan.modules.discriminators.mpd.MultiPeriodDiscriminator
+        periods: [2, 3, 5, 7, 11, 17, 23, 37]
+
+      mrd:
+        _target_: fish_speech.models.vqgan.modules.discriminators.mrd.MultiResolutionDiscriminator
+        resolutions:
+          - ["${n_fft}", "${hop_length}", "${win_length}"]
+          - [1024, 120, 600]
+          - [2048, 240, 1200]
+          - [4096, 480, 2400]
+          - [512, 50, 240]
+
+  multi_resolution_stft_loss:
+    _target_: fish_speech.models.vqgan.losses.MultiResolutionSTFTLoss
+    resolutions: ${model.discriminators.modules.mrd.resolutions}
+
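The single ensemble discriminator is split into a period discriminator (`mpd`) over prime periods and a resolution discriminator (`mrd`); each `mrd` resolution is an `(n_fft, hop_length, win_length)` triple, and the interpolation `${model.discriminators.modules.mrd.resolutions}` keeps the auxiliary STFT loss aligned with what the discriminator sees. A typical multi-resolution STFT loss combines a spectral-convergence term and a log-magnitude term per resolution; a sketch under that assumption (not the `fish_speech.models.vqgan.losses` implementation):

```python
import torch
import torch.nn.functional as F

def stft_magnitude(x, n_fft, hop_length, win_length):
    """Magnitude spectrogram of a waveform batch (batch, samples)."""
    window = torch.hann_window(win_length, device=x.device)
    spec = torch.stft(x, n_fft, hop_length, win_length,
                      window=window, return_complex=True)
    return spec.abs().clamp(min=1e-7)

def multi_resolution_stft_loss(fake, real, resolutions):
    """Average spectral-convergence + log-magnitude L1 over all resolutions."""
    total = 0.0
    for n_fft, hop, win in resolutions:
        f = stft_magnitude(fake, n_fft, hop, win)
        r = stft_magnitude(real, n_fft, hop, win)
        sc = torch.norm(r - f, p="fro") / torch.norm(r, p="fro")
        mag = F.l1_loss(f.log(), r.log())
        total = total + sc + mag
    return total / len(resolutions)
```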
   mel_transform:
     _target_: fish_speech.models.vqgan.spectrogram.LogMelSpectrogram
     sample_rate: ${sample_rate}
     n_fft: ${n_fft}
     hop_length: ${hop_length}
     win_length: ${win_length}
     n_mels: ${num_mels}
-    f_min: 0
-    f_max: 16000
 
   optimizer:
     _target_: torch.optim.AdamW
     _partial_: true
-    lr: 2e-4
+    lr: 1e-4
     betas: [0.8, 0.99]
     eps: 1e-5
 
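`_partial_: true` tells Hydra to return a `functools.partial` instead of an instantiated optimizer, since the parameter list is only known once the model exists; the halved learning rate (2e-4 → 1e-4) fits the fine-tune-from-checkpoint setup. What `hydra.utils.instantiate` produces here is roughly:

```python
from functools import partial

import torch

# Equivalent of instantiating the optimizer node with _partial_: true.
optimizer_factory = partial(torch.optim.AdamW, lr=1e-4, betas=(0.8, 0.99), eps=1e-5)

# Later, inside the LightningModule:
# optimizer = optimizer_factory(self.parameters())
```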
@@ -119,7 +153,7 @@ callbacks:
   grad_norm_monitor:
     sub_module:
       - generator
-      - discriminator
+      - discriminators
       - mel_encoder
       - vq_encoder
       - decoder
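The callback list is updated to match the renamed `discriminators` attribute; a grad-norm monitor of this kind typically logs the total gradient norm per named submodule after each backward pass. A sketch of the measurement (helper and attribute names assumed):

```python
import torch
import torch.nn as nn

def grad_norm(module: nn.Module) -> float:
    """Total L2 norm of gradients over a submodule's parameters."""
    norms = [p.grad.norm() for p in module.parameters() if p.grad is not None]
    return torch.stack(norms).norm().item() if norms else 0.0

# e.g. logged once per step for each entry of `sub_module`:
# for name in ["generator", "discriminators", "mel_encoder", "vq_encoder", "decoder"]:
#     log(f"grad_norm/{name}", grad_norm(getattr(model, name)))
```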