diff --git a/.github/workflows/stable-release.yml b/.github/workflows/stable-release.yml
index 28484a9d1e72..f501b7b31f19 100644
--- a/.github/workflows/stable-release.yml
+++ b/.github/workflows/stable-release.yml
@@ -117,7 +117,7 @@ jobs:
         ./python.exe get-pip.py
         ./python.exe -s -m pip install ../${{ inputs.cache_tag }}_python_deps/*
-        grep comfyui ../ComfyUI/requirements.txt > ./requirements_comfyui.txt
+        grep comfy ../ComfyUI/requirements.txt > ./requirements_comfyui.txt
         ./python.exe -s -m pip install -r requirements_comfyui.txt
         rm requirements_comfyui.txt
diff --git a/.github/workflows/test-ci.yml b/.github/workflows/test-ci.yml
index adfc5dd32049..63df2dc3a937 100644
--- a/.github/workflows/test-ci.yml
+++ b/.github/workflows/test-ci.yml
@@ -20,6 +20,7 @@ jobs:
   test-stable:
     strategy:
       fail-fast: false
+      max-parallel: 1 # This forces sequential execution
       matrix:
         # os: [macos, linux, windows]
         # os: [macos, linux]
@@ -74,6 +75,7 @@ jobs:
   test-unix-nightly:
     strategy:
       fail-fast: false
+      max-parallel: 1 # This forces sequential execution
       matrix:
         # os: [macos, linux]
         os: [linux]
diff --git a/comfy/latent_formats.py b/comfy/latent_formats.py
index 9bbe30b53f75..cb4f52ce11fa 100644
--- a/comfy/latent_formats.py
+++ b/comfy/latent_formats.py
@@ -408,7 +408,9 @@ def __init__(self):
         self.latent_rgb_factors_bias = [-0.0571, -0.1657, -0.2512]
 
 class LTXAV(LTXV):
-    pass
+    def __init__(self):
+        self.latent_rgb_factors = None
+        self.latent_rgb_factors_bias = None
 
 class HunyuanVideo(LatentFormat):
     latent_channels = 16
diff --git a/comfy/ldm/lightricks/embeddings_connector.py b/comfy/ldm/lightricks/embeddings_connector.py
index f7a43f3c32ad..06f5ada89bf3 100644
--- a/comfy/ldm/lightricks/embeddings_connector.py
+++ b/comfy/ldm/lightricks/embeddings_connector.py
@@ -276,7 +276,7 @@ def forward(
             max(1024, hidden_states.shape[1]) / self.num_learnable_registers
         )
         learnable_registers = torch.tile(
-            self.learnable_registers, (num_registers_duplications, 1)
+            self.learnable_registers.to(hidden_states), (num_registers_duplications, 1)
         )
         hidden_states = torch.cat((hidden_states, learnable_registers[hidden_states.shape[1]:].unsqueeze(0).repeat(hidden_states.shape[0], 1, 1)), dim=1)
diff --git a/comfy/model_management.py b/comfy/model_management.py
index 22f4de044de4..928282092573 100644
--- a/comfy/model_management.py
+++ b/comfy/model_management.py
@@ -1504,6 +1504,16 @@ def supports_fp8_compute(device=None):
 
     return True
 
+def supports_nvfp4_compute(device=None):
+    if not is_nvidia():
+        return False
+
+    props = torch.cuda.get_device_properties(device)
+    if props.major < 10:
+        return False
+
+    return True
+
 def extended_fp16_support():
     # TODO: check why some models work with fp16 on newer torch versions but not on older
     if torch_version_numeric < (2, 7):
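
Note on `supports_nvfp4_compute`: the `props.major < 10` check keys nvfp4 support off compute capability 10.x (Blackwell-class) GPUs. A minimal standalone probe of the same idea — only the nvfp4 cutoff comes from this diff; the fp8 threshold shown is an assumption for illustration:

```python
import torch

def describe_quant_support(device=None):
    # Safe on CPU-only machines: report no fast quantized compute.
    if not torch.cuda.is_available():
        return {"fp8": False, "nvfp4": False}
    major, minor = torch.cuda.get_device_capability(device)
    return {
        "fp8": (major, minor) >= (8, 9),  # assumed Ada/Hopper threshold, illustrative
        "nvfp4": major >= 10,             # the cutoff used in this diff
    }

print(describe_quant_support())
```
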
diff --git a/comfy/ops.py b/comfy/ops.py
index f5e1e9230288..cd536e22d7d0 100644
--- a/comfy/ops.py
+++ b/comfy/ops.py
@@ -427,12 +427,12 @@ def fp8_linear(self, input):
         input = torch.clamp(input, min=-448, max=448, out=input)
         input_fp8 = input.to(dtype).contiguous()
         layout_params_input = TensorCoreFP8Layout.Params(scale=scale_input, orig_dtype=input_dtype, orig_shape=tuple(input_fp8.shape))
-        quantized_input = QuantizedTensor(input_fp8, TensorCoreFP8Layout, layout_params_input)
+        quantized_input = QuantizedTensor(input_fp8, "TensorCoreFP8Layout", layout_params_input)
 
         # Wrap weight in QuantizedTensor - this enables unified dispatch
         # Call F.linear - __torch_dispatch__ routes to fp8_linear handler in quant_ops.py!
         layout_params_weight = TensorCoreFP8Layout.Params(scale=scale_weight, orig_dtype=input_dtype, orig_shape=tuple(w.shape))
-        quantized_weight = QuantizedTensor(w, TensorCoreFP8Layout, layout_params_weight)
+        quantized_weight = QuantizedTensor(w, "TensorCoreFP8Layout", layout_params_weight)
         o = torch.nn.functional.linear(quantized_input, quantized_weight, bias)
 
         uncast_bias_weight(self, w, bias, offload_stream)
@@ -493,11 +493,12 @@ def forward(self, *args, **kwargs):
         )
 
 
-def mixed_precision_ops(quant_config={}, compute_dtype=torch.bfloat16, full_precision_mm=False):
+def mixed_precision_ops(quant_config={}, compute_dtype=torch.bfloat16, full_precision_mm=False, disabled=[]):
     class MixedPrecisionOps(manual_cast):
         _quant_config = quant_config
         _compute_dtype = compute_dtype
         _full_precision_mm = full_precision_mm
+        _disabled = disabled
 
         class Linear(torch.nn.Module, CastWeightBiasOp):
             def __init__(
@@ -522,6 +523,7 @@ def __init__(
                 self.tensor_class = None
 
                 self._full_precision_mm = MixedPrecisionOps._full_precision_mm
+                self._full_precision_mm_config = False
 
             def reset_parameters(self):
                 return None
@@ -556,8 +558,12 @@ def _load_from_state_dict(self, state_dict, prefix, local_metadata,
                     self.weight = torch.nn.Parameter(weight.to(device=device, dtype=MixedPrecisionOps._compute_dtype), requires_grad=False)
                 else:
                     self.quant_format = layer_conf.get("format", None)
+                    self._full_precision_mm_config = layer_conf.get("full_precision_matrix_mult", False)
                     if not self._full_precision_mm:
-                        self._full_precision_mm = layer_conf.get("full_precision_matrix_mult", False)
+                        self._full_precision_mm = self._full_precision_mm_config
+
+                    if self.quant_format in MixedPrecisionOps._disabled:
+                        self._full_precision_mm = True
 
                     if self.quant_format is None:
                         raise ValueError(f"Unknown quantization format for layer {layer_name}")
@@ -630,7 +636,7 @@ def state_dict(self, *args, destination=None, prefix="", **kwargs):
                     sd["{}weight_scale".format(prefix)] = self.weight._params.block_scale
 
                 quant_conf = {"format": self.quant_format}
-                if self._full_precision_mm:
+                if self._full_precision_mm_config:
                     quant_conf["full_precision_matrix_mult"] = True
                 sd["{}comfy_quant".format(prefix)] = torch.tensor(list(json.dumps(quant_conf).encode('utf-8')), dtype=torch.uint8)
                 return sd
@@ -711,10 +717,17 @@ def _apply(self, fn, recurse=True): # This is to get torch.compile + moving wei
 def pick_operations(weight_dtype, compute_dtype, load_device=None, disable_fast_fp8=False, fp8_optimizations=False, model_config=None):
     fp8_compute = comfy.model_management.supports_fp8_compute(load_device)
     # TODO: if we support more ops this needs to be more granular
+    nvfp4_compute = comfy.model_management.supports_nvfp4_compute(load_device)
 
     if model_config and hasattr(model_config, 'quant_config') and model_config.quant_config:
         logging.info("Using mixed precision operations")
-        return mixed_precision_ops(model_config.quant_config, compute_dtype, full_precision_mm=not fp8_compute)
+        disabled = set()
+        if not nvfp4_compute:
+            disabled.add("nvfp4")
+        if not fp8_compute:
+            disabled.add("float8_e4m3fn")
+            disabled.add("float8_e5m2")
+        return mixed_precision_ops(model_config.quant_config, compute_dtype, disabled=disabled)
 
     if (
         fp8_compute and
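
What the `disabled` set does in practice: any layer whose checkpoint quant format the current GPU cannot execute natively is flipped to full-precision matrix multiplication instead of erroring out, while `_full_precision_mm_config` preserves only the user/checkpoint setting so `state_dict` round-trips cleanly. A toy sketch of the per-layer decision (names invented, not ComfyUI API):

```python
def wants_full_precision_mm(quant_format, config_flag, disabled):
    # Per-layer checkpoint flag forces full precision...
    if config_flag:
        return True
    # ...and so does a format this GPU can't run natively (e.g. nvfp4 pre-Blackwell).
    return quant_format in disabled

assert wants_full_precision_mm("nvfp4", False, {"nvfp4"})
assert not wants_full_precision_mm("float8_e4m3fn", False, set())
```
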
diff --git a/comfy/quant_ops.py b/comfy/quant_ops.py
index cd737726f41f..5a17bc6f50c2 100644
--- a/comfy/quant_ops.py
+++ b/comfy/quant_ops.py
@@ -13,6 +13,13 @@
         get_layout_class,
     )
     _CK_AVAILABLE = True
+    if torch.version.cuda is None:
+        ck.registry.disable("cuda")
+    else:
+        cuda_version = tuple(map(int, str(torch.version.cuda).split('.')))
+        if cuda_version < (13,):
+            ck.registry.disable("cuda")
+            ck.registry.disable("triton")
 
     for k, v in ck.list_backends().items():
         logging.info(f"Found comfy_kitchen backend {k}: {v}")
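
A quick note on the gate above: parsing `torch.version.cuda` into an int tuple before comparing avoids the classic lexicographic pitfall of comparing version strings. A self-contained illustration:

```python
def parse_cuda_version(s: str) -> tuple:
    return tuple(map(int, str(s).split('.')))

# String comparison mis-orders versions; tuple comparison does not.
assert not ("9.0" < "13")                   # string compare gives the wrong answer
assert parse_cuda_version("9.0") < (13,)    # tuple compare gets it right
assert parse_cuda_version("12.8") < (13,)   # CUDA 12.x builds disable the backend
assert parse_cuda_version("13.0") >= (13,)  # CUDA 13+ keeps it enabled
```
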
diff --git a/comfy/text_encoders/lt.py b/comfy/text_encoders/lt.py
index 2c2d453e89c9..130ebaeaed94 100644
--- a/comfy/text_encoders/lt.py
+++ b/comfy/text_encoders/lt.py
@@ -36,10 +36,10 @@ def __init__(self, embedding_directory=None, tokenizer_data={}):
 
 class Gemma3_12BModel(sd1_clip.SDClipModel):
     def __init__(self, device="cpu", layer="all", layer_idx=None, dtype=None, attention_mask=True, model_options={}):
-        llama_scaled_fp8 = model_options.get("gemma_scaled_fp8", None)
-        if llama_scaled_fp8 is not None:
+        llama_quantization_metadata = model_options.get("llama_quantization_metadata", None)
+        if llama_quantization_metadata is not None:
             model_options = model_options.copy()
-            model_options["scaled_fp8"] = llama_scaled_fp8
+            model_options["quantization_metadata"] = llama_quantization_metadata
         super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config={}, dtype=dtype, special_tokens={"start": 2, "pad": 0}, layer_norm_hidden_state=False, model_class=comfy.text_encoders.llama.Gemma3_12B, enable_attention_masks=attention_mask, return_attention_masks=attention_mask, model_options=model_options)
 
@@ -86,17 +86,19 @@ def __init__(self, dtype_llama=None, device="cpu", dtype=None, model_options={})
         )
 
     def set_clip_options(self, options):
+        self.execution_device = options.get("execution_device", self.execution_device)
         self.gemma3_12b.set_clip_options(options)
 
     def reset_clip_options(self):
         self.gemma3_12b.reset_clip_options()
+        self.execution_device = None
 
     def encode_token_weights(self, token_weight_pairs):
         token_weight_pairs = token_weight_pairs["gemma3_12b"]
         out, pooled, extra = self.gemma3_12b.encode_token_weights(token_weight_pairs)
         out_device = out.device
-        out = out.movedim(1, -1).to(self.text_embedding_projection.weight.device)
+        out = out.movedim(1, -1).to(self.execution_device)
         out = 8.0 * (out - out.mean(dim=(1, 2), keepdim=True)) / (out.amax(dim=(1, 2), keepdim=True) - out.amin(dim=(1, 2), keepdim=True) + 1e-6)
         out = out.reshape((out.shape[0], out.shape[1], -1))
         out = self.text_embedding_projection(out)
@@ -117,12 +119,12 @@ def load_sd(self, sd):
         return self.load_state_dict(sdo, strict=False)
 
 
-def ltxav_te(dtype_llama=None, llama_scaled_fp8=None):
+def ltxav_te(dtype_llama=None, llama_quantization_metadata=None):
     class LTXAVTEModel_(LTXAVTEModel):
         def __init__(self, device="cpu", dtype=None, model_options={}):
-            if llama_scaled_fp8 is not None and "llama_scaled_fp8" not in model_options:
+            if llama_quantization_metadata is not None:
                 model_options = model_options.copy()
-                model_options["llama_scaled_fp8"] = llama_scaled_fp8
+                model_options["llama_quantization_metadata"] = llama_quantization_metadata
             if dtype_llama is not None:
                 dtype = dtype_llama
             super().__init__(dtype_llama=dtype_llama, device=device, dtype=dtype, model_options=model_options)
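
Both renamed paths keep the copy-before-write pattern: `model_options` can be shared by several text encoders, so metadata is written into a shallow copy rather than the caller's dict. A minimal sketch of that pattern (the scenario is invented; key names are from the diff):

```python
def with_quant_metadata(model_options: dict, metadata):
    if metadata is not None:
        model_options = model_options.copy()  # shallow copy: don't mutate the caller
        model_options["quantization_metadata"] = metadata
    return model_options

shared = {"load_device": "cuda:0"}
opts = with_quant_metadata(shared, {"format": "float8_e4m3fn"})
assert "quantization_metadata" in opts
assert "quantization_metadata" not in shared  # caller's dict untouched
```
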
" + "Use identifiers such as `character1` and `character2` to refer to the reference characters.", + ), + IO.String.Input( + "negative_prompt", + multiline=True, + default="", + tooltip="Negative prompt describing what to avoid.", + ), + IO.Autogrow.Input( + "reference_videos", + template=IO.Autogrow.TemplateNames( + IO.Video.Input("reference_video"), + names=["character1", "character2", "character3"], + min=1, + ), + ), + IO.Combo.Input( + "size", + options=[ + "720p: 1:1 (960x960)", + "720p: 16:9 (1280x720)", + "720p: 9:16 (720x1280)", + "720p: 4:3 (1088x832)", + "720p: 3:4 (832x1088)", + "1080p: 1:1 (1440x1440)", + "1080p: 16:9 (1920x1080)", + "1080p: 9:16 (1080x1920)", + "1080p: 4:3 (1632x1248)", + "1080p: 3:4 (1248x1632)", + ], + ), + IO.Int.Input( + "duration", + default=5, + min=5, + max=10, + step=5, + display_mode=IO.NumberDisplay.slider, + ), + IO.Int.Input( + "seed", + default=0, + min=0, + max=2147483647, + step=1, + display_mode=IO.NumberDisplay.number, + control_after_generate=True, + ), + IO.Combo.Input( + "shot_type", + options=["single", "multi"], + tooltip="Specifies the shot type for the generated video, that is, whether the video is a " + "single continuous shot or multiple shots with cuts.", + ), + IO.Boolean.Input( + "watermark", + default=False, + tooltip="Whether to add an AI-generated watermark to the result.", + ), + ], + outputs=[ + IO.Video.Output(), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + ) + + @classmethod + async def execute( + cls, + model: str, + prompt: str, + negative_prompt: str, + reference_videos: IO.Autogrow.Type, + size: str, + duration: int, + seed: int, + shot_type: str, + watermark: bool, + ): + reference_video_urls = [] + for i in reference_videos: + validate_video_duration(reference_videos[i], min_duration=2, max_duration=30) + for i in reference_videos: + reference_video_urls.append(await upload_video_to_comfyapi(cls, reference_videos[i])) + width, height = RES_IN_PARENS.search(size).groups() + initial_response = await sync_op( + cls, + ApiEndpoint(path="/proxy/wan/api/v1/services/aigc/video-generation/video-synthesis", method="POST"), + response_model=TaskCreationResponse, + data=Reference2VideoTaskCreationRequest( + model=model, + input=Reference2VideoInputField( + prompt=prompt, negative_prompt=negative_prompt, reference_video_urls=reference_video_urls + ), + parameters=Reference2VideoParametersField( + size=f"{width}*{height}", + duration=duration, + shot_type=shot_type, + watermark=watermark, + seed=seed, + ), + ), + ) + if not initial_response.output: + raise Exception(f"An unknown error occurred: {initial_response.code} - {initial_response.message}") + response = await poll_op( + cls, + ApiEndpoint(path=f"/proxy/wan/api/v1/tasks/{initial_response.output.task_id}"), + response_model=VideoTaskStatusResponse, + status_extractor=lambda x: x.output.task_status, + poll_interval=6, + max_poll_attempts=280, + ) + return IO.NodeOutput(await download_url_to_video_output(response.output.video_url)) + + class WanApiExtension(ComfyExtension): @override async def get_node_list(self) -> list[type[IO.ComfyNode]]: @@ -729,6 +888,7 @@ async def get_node_list(self) -> list[type[IO.ComfyNode]]: WanImageToImageApi, WanTextToVideoApi, WanImageToVideoApi, + WanReferenceVideoApi, ] diff --git a/comfy_api_nodes/util/upload_helpers.py b/comfy_api_nodes/util/upload_helpers.py index b8d33f4d1438..f1ed7fe9c8f4 100644 --- a/comfy_api_nodes/util/upload_helpers.py +++ 
diff --git a/comfy_api_nodes/util/upload_helpers.py b/comfy_api_nodes/util/upload_helpers.py
index b8d33f4d1438..f1ed7fe9c8f4 100644
--- a/comfy_api_nodes/util/upload_helpers.py
+++ b/comfy_api_nodes/util/upload_helpers.py
@@ -119,7 +119,7 @@ async def upload_video_to_comfyapi(
         raise ValueError(f"Could not verify video duration from source: {e}") from e
 
     upload_mime_type = f"video/{container.value.lower()}"
-    filename = f"uploaded_video.{container.value.lower()}"
+    filename = f"{uuid.uuid4()}.{container.value.lower()}"
 
     # Convert VideoInput to BytesIO using specified container/codec
     video_bytes_io = BytesIO()
diff --git a/comfyui_version.py b/comfyui_version.py
index 1ed60fe5c007..750673f08208 100644
--- a/comfyui_version.py
+++ b/comfyui_version.py
@@ -1,3 +1,3 @@
 # This file is automatically generated by the build process when version is
 # updated in pyproject.toml.
-__version__ = "0.7.0"
+__version__ = "0.8.0"
diff --git a/pyproject.toml b/pyproject.toml
index a7d159be962e..951c2c97862c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "ComfyUI"
-version = "0.7.0"
+version = "0.8.0"
 readme = "README.md"
 license = { file = "LICENSE" }
 requires-python = ">=3.10"
diff --git a/requirements.txt b/requirements.txt
index 7798cb179ea8..bc8346bcf689 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
 comfyui-frontend-package==1.35.9
-comfyui-workflow-templates==0.7.66
+comfyui-workflow-templates==0.7.67
 comfyui-embedded-docs==0.3.1
 torch
 torchsde
@@ -21,7 +21,7 @@ psutil
 alembic
 SQLAlchemy
 av>=14.2.0
-comfy-kitchen>=0.2.2
+comfy-kitchen>=0.2.3
 
 #non essential dependencies:
 kornia>=0.7.1
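
Why the upload filename change matters: a fixed `uploaded_video.<ext>` name can collide when several uploads share a bucket prefix, while a UUIDv4 name is unique for all practical purposes. A tiny demonstration (the `.mp4` extension is illustrative; the real code derives it from the container enum):

```python
import uuid

names = {f"{uuid.uuid4()}.mp4" for _ in range(1000)}
assert len(names) == 1000  # 122 random bits: collisions are effectively impossible
print(next(iter(names)))   # e.g. "8d6f1c2e-6a6e-4f5d-9c0a-1b2d3e4f5a6b.mp4"
```
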