Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 5 additions & 15 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,23 +37,13 @@ For information on installation, configuration, and usage, please visit our [doc

Please see [this guide](https://docs.framepackstudio.com/docs/get_started/) on our documentation site to get FP-Studio installed.

## LoRAs
## Contributing

Add LoRAs to the /loras/ folder at the root of the installation. Select the LoRAs you wish to load and set the weights for each generation. Most Hunyuan LoRAs were originally trained for T2V, it's often helpful to run a T2V generation to ensure they're working before using input images.
We would love your help building FramePack Studio! To make collaboration effective, please adhere to the following:
- Keep Pull Requests Focused: Each Pull Request should address a single issue or add one specific feature. Please do not mix bug fixes, new features, and code refactoring in the same PR.
- Target the develop Branch: All Pull Requests must be opened against the develop branch. PRs opened against the main branch will be closed.
- Discuss Big Changes First: If you plan to work on a large feature or a significant refactor, please announce it first in the #contributors channel on our [Discord server](https://discord.com/invite/MtuM7gFJ3V). This helps us coordinate efforts and prevent duplicate work.

NOTE: Slow LoRA loading is a known issue.

## Working with Timestamped Prompts

You can create videos with changing prompts over time using the following syntax:

```
[0s: A serene forest with sunlight filtering through the trees ]
[5s: A deer appears in the clearing ]
[10s: The deer drinks from a small stream ]
```

Each timestamp defines when that prompt should start influencing the generation. The system will (hopefully) smoothly transition between prompts for a cohesive video.

## Credits

Expand Down
114 changes: 114 additions & 0 deletions modules/MMAudio/app.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
from typing import Optional
from pathlib import Path

import torch

from modules.MMAudio.mmaudio.eval_utils import (
ModelConfig,
all_model_cfg,
generate as mmaudio_generate,
load_video,
make_video,
)
from mmaudio.model.flow_matching import FlowMatching
from mmaudio.model.networks import MMAudio, get_my_mmaudio
from mmaudio.model.sequence_config import SequenceConfig
from mmaudio.model.utils.features_utils import FeaturesUtils

# Default negative-prompt terms used to steer audio generation away from
# music and background noise.
# NOTE(review): not referenced anywhere in this module's visible code —
# presumably consumed by the UI layer that calls add_audio_to_video; confirm.
DEFAULT_AUDIO_NEGATIVE_PROMPT: list[str] = [
    'music',
    'noise',
]

# MMAudio Settings
# Enable TensorFloat-32 matmul/conv kernels for faster inference on Ampere+ GPUs.
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
device = 'cuda'  # hard-coded CUDA device; this module has no CPU fallback
dtype = torch.bfloat16  # all MMAudio models in this module run in bfloat16

# Initialize MMAudio Model
def get_mmaudio_model(audio_model_config: Optional[ModelConfig] = None) -> tuple[MMAudio, FeaturesUtils, SequenceConfig]:
    """Load the MMAudio network plus its feature-extraction utilities.

    Args:
        audio_model_config: Configuration to load; when ``None`` the
            ``'large_44k_v2'`` entry of ``all_model_cfg`` is used.

    Returns:
        A ``(net, feature_utils, seq_cfg)`` tuple ready for inference on
        the module-level ``device``/``dtype``.
    """
    cfg = all_model_cfg['large_44k_v2'] if audio_model_config is None else audio_model_config
    cfg.download_if_needed()

    # Build the network on the configured device/dtype and load its checkpoint.
    net: MMAudio = get_my_mmaudio(cfg.model_name).to(device, dtype).eval()
    state_dict = torch.load(cfg.model_path, map_location=device, weights_only=True)
    net.load_weights(state_dict)

    # Feature utilities bundle the VAE decoder, Synchformer and vocoder
    # checkpoints; the VAE encoder is not needed for generation.
    feature_utils = FeaturesUtils(
        tod_vae_ckpt=cfg.vae_path,
        synchformer_ckpt=cfg.synchformer_ckpt,
        enable_conditions=True,
        mode=cfg.mode,
        bigvgan_vocoder_ckpt=cfg.bigvgan_16k_path,
        need_vae_encoder=False,
    ).to(device, dtype).eval()

    return net, feature_utils, cfg.seq_cfg


# Audio generation function
@torch.inference_mode()
def add_audio_to_video(
    video_path: Path,
    prompt: str,
    audio_negative_prompt: str,
    audio_steps: int,
    audio_cfg_strength: float,  # CFG guidance strength; fractional values are valid (was annotated int)
    duration: float,  # requested duration in seconds; replaced by the video's real duration below
    audio_net: MMAudio,
    audio_feature_utils: FeaturesUtils,
    audio_seq_cfg: SequenceConfig,
    overwrite_orig_file: bool
) -> Path:
    """Generate and add audio to video using MMAudio.

    Returns the path of the video that now carries the generated audio.
    On any failure the original ``video_path`` is returned unchanged
    (deliberate best-effort behavior so the silent video survives).
    """
    try:
        rng = torch.Generator(device=device)
        rng.seed()  # Random seed for audio — output varies between runs by design
        fm = FlowMatching(min_sigma=0, inference_mode='euler', num_steps=audio_steps)

        video_info = load_video(video_path, duration)
        clip_frames = video_info.clip_frames.unsqueeze(0)  # add batch dimension
        sync_frames = video_info.sync_frames.unsqueeze(0)  # add batch dimension
        duration = video_info.duration_sec
        audio_seq_cfg.duration = duration
        audio_net.update_seq_lengths(audio_seq_cfg.latent_seq_len, audio_seq_cfg.clip_seq_len,
                                     audio_seq_cfg.sync_seq_len)

        audios = mmaudio_generate(clip_frames, sync_frames,
                                  text=[prompt],
                                  negative_text=[audio_negative_prompt],
                                  feature_utils=audio_feature_utils,
                                  net=audio_net,
                                  fm=fm,
                                  rng=rng,
                                  cfg_strength=audio_cfg_strength)
        audio = audios.float().cpu()[0]

        if overwrite_orig_file:
            # Write the result over the original file, in the same location.
            video_with_audio_filename = video_path.name
        else:
            # Write next to the original. Use the stem so "clip.mp4" becomes
            # "clip_audio.mp4" rather than "clip.mp4_audio.mp4" (bug fix:
            # previously the full name, extension included, was suffixed).
            video_with_audio_filename = video_path.stem + "_audio.mp4"

        video_with_audio_path = video_path.parent / video_with_audio_filename

        # add a 'h264' video encoded stream and a 'aac' encoded audio stream, to the file 'video_with_audio_path'
        make_video(video_info, video_with_audio_path, audio, sampling_rate=audio_seq_cfg.sampling_rate)

        return video_with_audio_path
    except Exception as e:
        # Best-effort: log and fall back to the silent original video.
        print(f"Error in audio generation: {e}")
        return video_path
20 changes: 20 additions & 0 deletions modules/MMAudio/mmaudio/data/av_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,27 @@ def reencode_with_audio(video_info: VideoInfo, output_path: Path, audio: torch.T
container.mux(packet)

container.close()


def reencode_without_audio(video_info: VideoInfo, output_path: Path):
    """Re-encode the frames of *video_info* to *output_path* as h264/yuv420p,
    writing no audio stream.

    Mirrors ``reencode_with_audio`` above but skips the audio muxing.
    """
    container = av.open(output_path, 'w')
    output_video_stream = container.add_stream('h264', video_info.fps)
    # bit_rate must be an int; 10 * 1e6 produced a float (bug fix).
    output_video_stream.codec_context.bit_rate = 10_000_000  # 10 Mbps
    output_video_stream.width = video_info.width
    output_video_stream.height = video_info.height
    output_video_stream.pix_fmt = 'yuv420p'

    # encode video — encode() may buffer frames and return packets lazily
    for image in video_info.all_frames:
        image = av.VideoFrame.from_ndarray(image)
        packet = output_video_stream.encode(image)
        container.mux(packet)

    # flush any frames still buffered in the encoder
    for packet in output_video_stream.encode():
        container.mux(packet)

    container.close()


def remux_with_audio(video_path: Path, audio: torch.Tensor, output_path: Path, sampling_rate: int):
"""
Expand Down
31 changes: 31 additions & 0 deletions modules/MMAudio/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
transformers
accelerate
safetensors
sentencepiece
peft
ftfy
imageio-ffmpeg
opencv-python

python-dotenv
cython
gitpython >= 3.1
tensorboard >= 2.11
numpy >= 1.21, <2.1
Pillow >= 9.5
scipy >= 1.7
tqdm >= 4.66.1
gradio >= 3.34
einops >= 0.6
hydra-core >= 1.3.2
requests
torchdiffeq
librosa >= 0.8.1
nitrous-ema
auraloss
hydra_colorlog
tensordict
colorlog
open_clip_torch
soundfile
av
4 changes: 3 additions & 1 deletion modules/interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -337,7 +337,9 @@ def apply_startup_settings():
def refresh_loras(current_selected):
    """Re-scan the LoRA directory and refresh the checkbox choices,
    preserving any currently selected names that still exist on disk."""
    if not enumerate_lora_dir_fn:
        return gr.update()
    available = enumerate_lora_dir_fn()
    selected = current_selected or []
    kept = [entry for entry in selected if entry in available]
    return gr.update(choices=available, value=kept)

Expand Down
8 changes: 6 additions & 2 deletions modules/ui/generate.py
Original file line number Diff line number Diff line change
Expand Up @@ -616,10 +616,14 @@ def process_with_queue_update(model_type_arg, *args):
# After refreshing available LoRAs, choices can change, but we must keep lora_loaded_names (state)
# aligned with the slider input order to avoid mixing/misalignment of weights.
stable_slider_order = list(g["lora_sliders"].keys())
incoming_weight_by_name = dict(zip(stable_slider_order, lora_slider_values_tuple))
incoming_weight_by_name = dict(
zip(stable_slider_order, lora_slider_values_tuple)
)
# Override the lora_names_states and weights passed to the backend to match the stable slider order
lora_names_states_arg = stable_slider_order
lora_slider_values_tuple = [incoming_weight_by_name.get(name, 1.0) for name in stable_slider_order]
lora_slider_values_tuple = [
incoming_weight_by_name.get(name, 1.0) for name in stable_slider_order
]

result = f["process_fn"](
backend_model_type,
Expand Down
Loading
Loading