Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 5 additions & 15 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,23 +37,13 @@ For information on installation, configuration, and usage, please visit our [doc

Please see [this guide](https://docs.framepackstudio.com/docs/get_started/) on our documentation site to get FP-Studio installed.

## LoRAs
## Contributing

Add LoRAs to the /loras/ folder at the root of the installation. Select the LoRAs you wish to load and set the weights for each generation. Most Hunyuan LoRAs were originally trained for T2V, it's often helpful to run a T2V generation to ensure they're working before using input images.
We would love your help building FramePack Studio! To make collaboration effective, please adhere to the following:
- Keep Pull Requests Focused: Each Pull Request should address a single issue or add one specific feature. Please do not mix bug fixes, new features, and code refactoring in the same PR.
- Target the develop Branch: All Pull Requests must be opened against the develop branch. PRs opened against the main branch will be closed.
- Discuss Big Changes First: If you plan to work on a large feature or a significant refactor, please announce it first in the #contributors channel on our [Discord server](https://discord.com/invite/MtuM7gFJ3V). This helps us coordinate efforts and prevent duplicate work.

NOTE: Slow LoRA loading is a known issue.

## Working with Timestamped Prompts

You can create videos with changing prompts over time using the following syntax:

```
[0s: A serene forest with sunlight filtering through the trees ]
[5s: A deer appears in the clearing ]
[10s: The deer drinks from a small stream ]
```

Each timestamp defines when that prompt should start influencing the generation. The system will (hopefully) smoothly transition between prompts for a cohesive video.

## Credits

Expand Down
114 changes: 114 additions & 0 deletions modules/MMAudio/app.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
from typing import Optional
from pathlib import Path

import torch

from modules.MMAudio.mmaudio.eval_utils import (
ModelConfig,
all_model_cfg,
generate as mmaudio_generate,
load_video,
make_video,
)
from mmaudio.model.flow_matching import FlowMatching
from mmaudio.model.networks import MMAudio, get_my_mmaudio
from mmaudio.model.sequence_config import SequenceConfig
from mmaudio.model.utils.features_utils import FeaturesUtils

# Default negative-prompt terms used to steer audio generation away from
# music and background noise.
# NOTE(review): not referenced anywhere in this module's visible code —
# presumably consumed by the UI layer that calls add_audio_to_video; confirm.
DEFAULT_AUDIO_NEGATIVE_PROMPT: list[str] = [
    'music',
    'noise',
]

# MMAudio Settings
# Enable TensorFloat-32 matmul/conv kernels for faster inference on Ampere+ GPUs.
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
device = 'cuda'  # hard-coded CUDA device; this module has no CPU fallback
dtype = torch.bfloat16  # all MMAudio models in this module run in bfloat16

# Initialize MMAudio Model
def get_mmaudio_model(audio_model_config: Optional[ModelConfig] = None) -> tuple[MMAudio, FeaturesUtils, SequenceConfig]:
    """Load the MMAudio network plus its feature-extraction utilities.

    Args:
        audio_model_config: Configuration to load; when ``None`` the
            ``'large_44k_v2'`` entry of ``all_model_cfg`` is used.

    Returns:
        A ``(net, feature_utils, seq_cfg)`` tuple ready for inference on
        the module-level ``device``/``dtype``.
    """
    cfg = all_model_cfg['large_44k_v2'] if audio_model_config is None else audio_model_config
    cfg.download_if_needed()

    # Build the network on the configured device/dtype and load its checkpoint.
    net: MMAudio = get_my_mmaudio(cfg.model_name).to(device, dtype).eval()
    state_dict = torch.load(cfg.model_path, map_location=device, weights_only=True)
    net.load_weights(state_dict)

    # Feature utilities bundle the VAE decoder, Synchformer and vocoder
    # checkpoints; the VAE encoder is not needed for generation.
    feature_utils = FeaturesUtils(
        tod_vae_ckpt=cfg.vae_path,
        synchformer_ckpt=cfg.synchformer_ckpt,
        enable_conditions=True,
        mode=cfg.mode,
        bigvgan_vocoder_ckpt=cfg.bigvgan_16k_path,
        need_vae_encoder=False,
    ).to(device, dtype).eval()

    return net, feature_utils, cfg.seq_cfg


# Audio generation function
@torch.inference_mode()
def add_audio_to_video(
    video_path: Path,
    prompt: str,
    audio_negative_prompt: str,
    audio_steps: int,
    audio_cfg_strength: float,  # CFG guidance strength; fractional values are valid (was annotated int)
    duration: float,  # requested duration in seconds; replaced by the video's real duration below
    audio_net: MMAudio,
    audio_feature_utils: FeaturesUtils,
    audio_seq_cfg: SequenceConfig,
    overwrite_orig_file: bool
) -> Path:
    """Generate and add audio to video using MMAudio.

    Returns the path of the video that now carries the generated audio.
    On any failure the original ``video_path`` is returned unchanged
    (deliberate best-effort behavior so the silent video survives).
    """
    try:
        rng = torch.Generator(device=device)
        rng.seed()  # Random seed for audio — output varies between runs by design
        fm = FlowMatching(min_sigma=0, inference_mode='euler', num_steps=audio_steps)

        video_info = load_video(video_path, duration)
        clip_frames = video_info.clip_frames.unsqueeze(0)  # add batch dimension
        sync_frames = video_info.sync_frames.unsqueeze(0)  # add batch dimension
        duration = video_info.duration_sec
        audio_seq_cfg.duration = duration
        audio_net.update_seq_lengths(audio_seq_cfg.latent_seq_len, audio_seq_cfg.clip_seq_len,
                                     audio_seq_cfg.sync_seq_len)

        audios = mmaudio_generate(clip_frames, sync_frames,
                                  text=[prompt],
                                  negative_text=[audio_negative_prompt],
                                  feature_utils=audio_feature_utils,
                                  net=audio_net,
                                  fm=fm,
                                  rng=rng,
                                  cfg_strength=audio_cfg_strength)
        audio = audios.float().cpu()[0]

        if overwrite_orig_file:
            # Write the result over the original file, in the same location.
            video_with_audio_filename = video_path.name
        else:
            # Write next to the original. Use the stem so "clip.mp4" becomes
            # "clip_audio.mp4" rather than "clip.mp4_audio.mp4" (bug fix:
            # previously the full name, extension included, was suffixed).
            video_with_audio_filename = video_path.stem + "_audio.mp4"

        video_with_audio_path = video_path.parent / video_with_audio_filename

        # add a 'h264' video encoded stream and a 'aac' encoded audio stream, to the file 'video_with_audio_path'
        make_video(video_info, video_with_audio_path, audio, sampling_rate=audio_seq_cfg.sampling_rate)

        return video_with_audio_path
    except Exception as e:
        # Best-effort: log and fall back to the silent original video.
        print(f"Error in audio generation: {e}")
        return video_path
20 changes: 20 additions & 0 deletions modules/MMAudio/mmaudio/data/av_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,27 @@ def reencode_with_audio(video_info: VideoInfo, output_path: Path, audio: torch.T
container.mux(packet)

container.close()


def reencode_without_audio(video_info: VideoInfo, output_path: Path):
    """Re-encode the frames of *video_info* to *output_path* as h264/yuv420p,
    writing no audio stream.

    Mirrors ``reencode_with_audio`` above but skips the audio muxing.
    """
    container = av.open(output_path, 'w')
    output_video_stream = container.add_stream('h264', video_info.fps)
    # bit_rate must be an int; 10 * 1e6 produced a float (bug fix).
    output_video_stream.codec_context.bit_rate = 10_000_000  # 10 Mbps
    output_video_stream.width = video_info.width
    output_video_stream.height = video_info.height
    output_video_stream.pix_fmt = 'yuv420p'

    # encode video — encode() may buffer frames and return packets lazily
    for image in video_info.all_frames:
        image = av.VideoFrame.from_ndarray(image)
        packet = output_video_stream.encode(image)
        container.mux(packet)

    # flush any frames still buffered in the encoder
    for packet in output_video_stream.encode():
        container.mux(packet)

    container.close()


def remux_with_audio(video_path: Path, audio: torch.Tensor, output_path: Path, sampling_rate: int):
"""
Expand Down
31 changes: 31 additions & 0 deletions modules/MMAudio/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
transformers
accelerate
safetensors
sentencepiece
peft
ftfy
imageio-ffmpeg
opencv-python

python-dotenv
cython
gitpython >= 3.1
tensorboard >= 2.11
numpy >= 1.21, <2.1
Pillow >= 9.5
scipy >= 1.7
tqdm >= 4.66.1
gradio >= 3.34
einops >= 0.6
hydra-core >= 1.3.2
requests
torchdiffeq
librosa >= 0.8.1
nitrous-ema
auraloss
hydra_colorlog
tensordict
colorlog
open_clip_torch
soundfile
av
4 changes: 3 additions & 1 deletion modules/interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -337,7 +337,9 @@ def apply_startup_settings():
def refresh_loras(current_selected):
    """Re-scan the LoRA directory and refresh the checkbox choices,
    preserving any currently selected names that still exist on disk."""
    if not enumerate_lora_dir_fn:
        return gr.update()
    available = enumerate_lora_dir_fn()
    selected = current_selected or []
    kept = [entry for entry in selected if entry in available]
    return gr.update(choices=available, value=kept)

Expand Down
8 changes: 6 additions & 2 deletions modules/ui/generate.py
Original file line number Diff line number Diff line change
Expand Up @@ -616,10 +616,14 @@ def process_with_queue_update(model_type_arg, *args):
# After refreshing available LoRAs, choices can change, but we must keep lora_loaded_names (state)
# aligned with the slider input order to avoid mixing/misalignment of weights.
stable_slider_order = list(g["lora_sliders"].keys())
incoming_weight_by_name = dict(zip(stable_slider_order, lora_slider_values_tuple))
incoming_weight_by_name = dict(
zip(stable_slider_order, lora_slider_values_tuple)
)
# Override the lora_names_states and weights passed to the backend to match the stable slider order
lora_names_states_arg = stable_slider_order
lora_slider_values_tuple = [incoming_weight_by_name.get(name, 1.0) for name in stable_slider_order]
lora_slider_values_tuple = [
incoming_weight_by_name.get(name, 1.0) for name in stable_slider_order
]

result = f["process_fn"](
backend_model_type,
Expand Down
Loading
Loading