code · pull · Sep 5, 2025 · Sep 4, 2025 · Sep 5, 2025 · Sep 5, 2025
diff --git a/.github/workflows/test-execution.yml b/.github/workflows/test-execution.yml
@@ -0,0 +1,30 @@
+name: Execution Tests
+# Runs the tests/execution pytest suite on Linux, Windows and macOS for pushes and PRs to main/master.
+on:
+  push:
+    branches: [ main, master ]
+  pull_request:
+    branches: [ main, master ]
+
+jobs:
+  test:
+    strategy:
+      matrix:
+        os: [ubuntu-latest, windows-latest, macos-latest]
+    runs-on: ${{ matrix.os }}
+    continue-on-error: true  # NOTE(review): a failing job will not fail the workflow run - confirm this is intended
+    steps:
+    - uses: actions/checkout@v4
+    - name: Set up Python
+      uses: actions/setup-python@v5
+      with:
+        python-version: '3.12'
+    - name: Install requirements
+      run: |
+        python -m pip install --upgrade pip
+        pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
+        pip install -r requirements.txt
+        pip install -r tests-unit/requirements.txt
+    - name: Run Execution Tests
+      run: |
+        python -m pytest tests/execution -v --skip-timing-checks
diff --git a/comfy/clip_vision.py b/comfy/clip_vision.py
@@ -136,8 +136,12 @@ def load_clipvision_from_sd(sd, prefix="", convert_keys=False):
                 json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_vitl_336.json")
         else:
             json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_vitl.json")
-    elif "embeddings.patch_embeddings.projection.weight" in sd:
+
+    # Dinov2
+    elif 'encoder.layer.39.layer_scale2.lambda1' in sd:
         json_config = os.path.join(os.path.join(os.path.dirname(os.path.realpath(__file__)), "image_encoders"), "dino2_giant.json")
+    elif 'encoder.layer.23.layer_scale2.lambda1' in sd:
+        json_config = os.path.join(os.path.join(os.path.dirname(os.path.realpath(__file__)), "image_encoders"), "dino2_large.json")
     else:
         return None
 

diff --git a/comfy/image_encoders/dino2.py b/comfy/image_encoders/dino2.py
@@ -31,6 +31,20 @@ def __init__(self, dim, dtype, device, operations):
     def forward(self, x):
         return x * comfy.model_management.cast_to_device(self.lambda1, x.device, x.dtype)
 
+class Dinov2MLP(torch.nn.Module):
+    """Two-layer GELU feed-forward block; Dino2Block uses it instead of SwiGLUFFN when use_swiglu_ffn is false."""
+    def __init__(self, hidden_size: int, dtype, device, operations, mlp_ratio: int = 4):
+        super().__init__()
+        # mlp_ratio scales the hidden width; the default 4 matches "mlp_ratio" in dino2_large.json.
+        hidden_features = int(hidden_size * mlp_ratio)
+        self.fc1 = operations.Linear(hidden_size, hidden_features, bias=True, device=device, dtype=dtype)
+        self.fc2 = operations.Linear(hidden_features, hidden_size, bias=True, device=device, dtype=dtype)
+
+    def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
+        """Return fc2(gelu(fc1(hidden_state)))."""
+        hidden_state = self.fc1(hidden_state)
+        hidden_state = torch.nn.functional.gelu(hidden_state)
+        return self.fc2(hidden_state)
 
 class SwiGLUFFN(torch.nn.Module):
     def __init__(self, dim, dtype, device, operations):
@@ -50,12 +64,15 @@ def forward(self, x):
 
 
 class Dino2Block(torch.nn.Module):
-    def __init__(self, dim, num_heads, layer_norm_eps, dtype, device, operations):
+    def __init__(self, dim, num_heads, layer_norm_eps, dtype, device, operations, use_swiglu_ffn):
         super().__init__()
         self.attention = Dino2AttentionBlock(dim, num_heads, layer_norm_eps, dtype, device, operations)
         self.layer_scale1 = LayerScale(dim, dtype, device, operations)
         self.layer_scale2 = LayerScale(dim, dtype, device, operations)
-        self.mlp = SwiGLUFFN(dim, dtype, device, operations)
+        if use_swiglu_ffn:
+            self.mlp = SwiGLUFFN(dim, dtype, device, operations)
+        else:
+            self.mlp = Dinov2MLP(dim, dtype, device, operations)
         self.norm1 = operations.LayerNorm(dim, eps=layer_norm_eps, dtype=dtype, device=device)
         self.norm2 = operations.LayerNorm(dim, eps=layer_norm_eps, dtype=dtype, device=device)
 
@@ -66,9 +83,10 @@ def forward(self, x, optimized_attention):
 
 
 class Dino2Encoder(torch.nn.Module):
-    def __init__(self, dim, num_heads, layer_norm_eps, num_layers, dtype, device, operations):
+    def __init__(self, dim, num_heads, layer_norm_eps, num_layers, dtype, device, operations, use_swiglu_ffn):
         super().__init__()
-        self.layer = torch.nn.ModuleList([Dino2Block(dim, num_heads, layer_norm_eps, dtype, device, operations) for _ in range(num_layers)])
+        self.layer = torch.nn.ModuleList([Dino2Block(dim, num_heads, layer_norm_eps, dtype, device, operations, use_swiglu_ffn = use_swiglu_ffn)
+                                          for _ in range(num_layers)])
 
     def forward(self, x, intermediate_output=None):
         optimized_attention = optimized_attention_for_device(x.device, False, small_input=True)
@@ -78,8 +96,8 @@ def forward(self, x, intermediate_output=None):
                 intermediate_output = len(self.layer) + intermediate_output
 
         intermediate = None
-        for i, l in enumerate(self.layer):
-            x = l(x, optimized_attention)
+        for i, layer in enumerate(self.layer):
+            x = layer(x, optimized_attention)
             if i == intermediate_output:
                 intermediate = x.clone()
         return x, intermediate
@@ -128,9 +146,10 @@ def __init__(self, config_dict, dtype, device, operations):
         dim = config_dict["hidden_size"]
         heads = config_dict["num_attention_heads"]
         layer_norm_eps = config_dict["layer_norm_eps"]
+        use_swiglu_ffn = config_dict["use_swiglu_ffn"]
 
         self.embeddings = Dino2Embeddings(dim, dtype, device, operations)
-        self.encoder = Dino2Encoder(dim, heads, layer_norm_eps, num_layers, dtype, device, operations)
+        self.encoder = Dino2Encoder(dim, heads, layer_norm_eps, num_layers, dtype, device, operations, use_swiglu_ffn = use_swiglu_ffn)
         self.layernorm = operations.LayerNorm(dim, eps=layer_norm_eps, dtype=dtype, device=device)
 
     def forward(self, pixel_values, attention_mask=None, intermediate_output=None):

diff --git a/comfy/image_encoders/dino2_large.json b/comfy/image_encoders/dino2_large.json
@@ -0,0 +1,22 @@
+{
+  "hidden_size": 1024,
+  "use_mask_token": true,
+  "patch_size": 14,
+  "image_size": 518,
+  "num_channels": 3,
+  "num_attention_heads": 16,
+  "initializer_range": 0.02,
+  "attention_probs_dropout_prob": 0.0,
+  "hidden_dropout_prob": 0.0,
+  "hidden_act": "gelu",
+  "mlp_ratio": 4,
+  "model_type": "dinov2",
+  "num_hidden_layers": 24,
+  "layer_norm_eps": 1e-6,
+  "qkv_bias": true,
+  "use_swiglu_ffn": false,
+  "layerscale_value": 1.0,
+  "drop_path_rate": 0.0,
+  "image_mean": [0.485, 0.456, 0.406],
+  "image_std": [0.229, 0.224, 0.225]
+}
diff --git a/comfy/latent_formats.py b/comfy/latent_formats.py
@@ -538,6 +538,11 @@ class Hunyuan3Dv2(LatentFormat):
     latent_dimensions = 1
     scale_factor = 0.9990943042622529
 
+class Hunyuan3Dv2_1(LatentFormat):  # same latent_channels/latent_dimensions as Hunyuan3Dv2mini below
+    latent_channels = 64
+    latent_dimensions = 1
+    scale_factor = 1.0039506158752403
+
 class Hunyuan3Dv2mini(LatentFormat):
     latent_channels = 64
     latent_dimensions = 1