@@ -554,6 +554,8 @@ def __init__(self, sd=None, device=None, config=None, dtype=None, metadata=None)
554554 elif "decoder.layers.1.layers.0.beta" in sd :
555555 config = {}
556556 param_key = None
557+ self .upscale_ratio = 2048
558+ self .downscale_ratio = 2048
557559 if "decoder.layers.2.layers.1.weight_v" in sd :
558560 param_key = "decoder.layers.2.layers.1.weight_v"
559561 if "decoder.layers.2.layers.1.parametrizations.weight.original1" in sd :
@@ -562,15 +564,15 @@ def __init__(self, sd=None, device=None, config=None, dtype=None, metadata=None)
562564 if sd [param_key ].shape [- 1 ] == 12 :
563565 config ["strides" ] = [2 , 4 , 4 , 6 , 10 ]
564566 self .audio_sample_rate = 48000
567+ self .upscale_ratio = 1920
568+ self .downscale_ratio = 1920
565569
566570 self .first_stage_model = AudioOobleckVAE (** config )
567571 self .memory_used_encode = lambda shape , dtype : (1000 * shape [2 ]) * model_management .dtype_size (dtype )
568572 self .memory_used_decode = lambda shape , dtype : (1000 * shape [2 ] * 2048 ) * model_management .dtype_size (dtype )
569573 self .latent_channels = 64
570574 self .output_channels = 2
571575 self .pad_channel_value = "replicate"
572- self .upscale_ratio = 2048
573- self .downscale_ratio = 2048
574576 self .latent_dim = 1
575577 self .process_output = lambda audio : audio
576578 self .process_input = lambda audio : audio
@@ -870,7 +872,7 @@ def decode_tiled_(self, samples, tile_x=64, tile_y=64, overlap = 16):
870872 / 3.0 )
871873 return output
872874
873- def decode_tiled_1d (self , samples , tile_x = 128 , overlap = 32 ):
875+ def decode_tiled_1d (self , samples , tile_x = 256 , overlap = 32 ):
874876 if samples .ndim == 3 :
875877 decode_fn = lambda a : self .first_stage_model .decode (a .to (self .vae_dtype ).to (self .device )).float ()
876878 else :
0 commit comments