-
Notifications
You must be signed in to change notification settings - Fork 82
Open
Description
The default tiling config in training (train_recammaster.py) is tiled=False, tile_size=(34, 34), tile_stride=(18, 16), while the default config in inference (inference_recammaster.py, diffsynth/pipelines/wan_video.py) is tiled=True, tile_size=(30, 52), tile_stride=(15, 26).
This mismatch can cause an absolute error of about 0.2 in the encoded VAE features. Is this a significant issue?
# train_recammaster.py
parser.add_argument(
"--tiled",
default=False,
action="store_true",
help="Whether enable tile encode in VAE. This option can reduce VRAM required.",
)
parser.add_argument(
"--tile_size_height",
type=int,
default=34,
help="Tile size (height) in VAE.",
)
parser.add_argument(
"--tile_size_width",
type=int,
default=34,
help="Tile size (width) in VAE.",
)
parser.add_argument(
"--tile_stride_height",
type=int,
default=18,
help="Tile stride (height) in VAE.",
)
parser.add_argument(
"--tile_stride_width",
type=int,
default=16,
help="Tile stride (width) in VAE.",
)
# inference_recammaster.py
video = pipe(
prompt=target_text,
negative_prompt="色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走",
source_video=source_video,
target_camera=target_camera,
cfg_scale=args.cfg_scale,
num_inference_steps=50,
seed=0, tiled=True
)
# diffsynth/pipelines/wan_video.py
@torch.no_grad()
def __call__(
self,
prompt,
negative_prompt="",
source_video=None,
target_camera=None,
input_image=None,
input_video=None,
denoising_strength=1.0,
seed=None,
rand_device="cpu",
height=480,
width=832,
num_frames=81,
cfg_scale=5.0,
num_inference_steps=50,
sigma_shift=5.0,
tiled=True,
tile_size=(30, 52),
tile_stride=(15, 26),
tea_cache_l1_thresh=None,
tea_cache_model_id="",
progress_bar_cmd=tqdm,
progress_bar_st=None,
):
Reactions are currently unavailable
Metadata
Metadata
Assignees
Labels
No labels