Merge branch 'main' into fix-multiple-tokenizers-saved

aijadugar · web-flow · commit a9b0d95f8711 · 2025-10-24T19:58:41.000+05:30
diff --git a/src/transformers/models/qwen2_audio/modeling_qwen2_audio.py b/src/transformers/models/qwen2_audio/modeling_qwen2_audio.py
@@ -25,6 +25,7 @@
 from ...activations import ACT2FN
 from ...cache_utils import Cache
 from ...generation import GenerationMixin
+from ...masking_utils import create_bidirectional_mask
 from ...modeling_layers import GradientCheckpointingLayer
 from ...modeling_outputs import BaseModelOutput, ModelOutput
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
@@ -774,14 +775,19 @@ def forward(
                 lengths_expand = audio_feat_lengths.unsqueeze(1).expand(batch_size, max_seq_len)
                 # Create mask
                 padding_mask = seq_range >= lengths_expand
+                audio_attention_mask_2d = (~padding_mask).to(dtype=torch.long, device=audio_feat_lengths.device)
 
-                audio_attention_mask_ = padding_mask.view(batch_size, 1, 1, max_seq_len).expand(
-                    batch_size, 1, max_seq_len, max_seq_len
+                dummy_embeds = torch.zeros(
+                    (batch_size, max_seq_len, 1),
+                    dtype=inputs_embeds.dtype,
+                    device=inputs_embeds.device,
                 )
-                audio_attention_mask = audio_attention_mask_.to(
-                    dtype=self.audio_tower.conv1.weight.dtype, device=self.audio_tower.conv1.weight.device
+
+                audio_attention_mask = create_bidirectional_mask(
+                    config=self.audio_tower.config,
+                    input_embeds=dummy_embeds,
+                    attention_mask=audio_attention_mask_2d,
                 )
-                audio_attention_mask[audio_attention_mask_] = float("-inf")
 
                 audio_outputs = self.audio_tower(input_features, attention_mask=audio_attention_mask)
                 selected_audio_feature = audio_outputs.last_hidden_state
diff --git a/src/transformers/models/qwen3_vl/modular_qwen3_vl.py b/src/transformers/models/qwen3_vl/modular_qwen3_vl.py
@@ -1316,7 +1316,6 @@ def __call__(
                 video_metadata = videos_inputs.pop("video_metadata")
             else:
                 video_metadata = videos_inputs["video_metadata"]
-            video_grid_thw = videos_inputs["video_grid_thw"]
         else:
             videos_inputs = {}
             video_grid_thw = None
diff --git a/src/transformers/models/qwen3_vl/processing_qwen3_vl.py b/src/transformers/models/qwen3_vl/processing_qwen3_vl.py
@@ -157,7 +157,6 @@ def __call__(
                 video_metadata = videos_inputs.pop("video_metadata")
             else:
                 video_metadata = videos_inputs["video_metadata"]
-            video_grid_thw = videos_inputs["video_grid_thw"]
         else:
             videos_inputs = {}
             video_grid_thw = None
diff --git a/src/transformers/quantizers/quantizer_mxfp4.py b/src/transformers/quantizers/quantizer_mxfp4.py
@@ -383,6 +383,10 @@ def get_state_dict_and_metadata(self, model, safe_serialization: bool = False):
 
         state_dict = model.state_dict()
 
+        # Read expert count and hidden size from the model config (defaults are the previously hard-coded values)
+        num_local_experts = getattr(model.config, "num_local_experts", 32)
+        hidden_size = getattr(model.config, "hidden_size", 2880)
+
         for name, module in model.named_modules():
             if (
                 isinstance(module, Mxfp4GptOssExperts)
@@ -392,7 +396,7 @@ def get_state_dict_and_metadata(self, model, safe_serialization: bool = False):
                 state_dict[f"{name}.gate_up_proj_blocks"] = (
                     module.gate_up_proj.storage.layout.unswizzle_data(module.gate_up_proj.storage.data)
                     .transpose(-1, -2)
-                    .reshape(32, -1, 90, 16)
+                    .reshape(num_local_experts, -1, 90, 16)
                 )
                 state_dict[f"{name}.gate_up_proj_scales"] = (
                     module.gate_up_proj_precision_config.weight_scale.storage.layout.unswizzle_data(
@@ -402,7 +406,7 @@ def get_state_dict_and_metadata(self, model, safe_serialization: bool = False):
                 state_dict[f"{name}.down_proj_blocks"] = (
                     module.down_proj.storage.layout.unswizzle_data(module.down_proj.storage.data)
                     .transpose(-1, -2)
-                    .reshape(32, 2880, 90, -1)
+                    .reshape(num_local_experts, hidden_size, 90, -1)
                 )
                 state_dict[f"{name}.down_proj_scales"] = (
                     module.down_proj_precision_config.weight_scale.storage.layout.unswizzle_data(
diff --git a/src/transformers/utils/chat_parsing_utils.py b/src/transformers/utils/chat_parsing_utils.py
@@ -173,12 +173,12 @@ def recursive_parse(
             return parsed_schema
         elif isinstance(node_content, dict):
             for key, child_node in node_schema.get("properties", {}).items():
-                if key in node_content:
+                if "const" in child_node:
+                    parsed_schema[key] = child_node["const"]
+                elif key in node_content:
                     parsed_schema[key] = recursive_parse(node_content[key], child_node)
                 elif "default" in child_node:
                     parsed_schema[key] = child_node["default"]
-                else:
-                    pass
             if "additionalProperties" in node_schema:
                 for key, value in node_content.items():
                     if key not in node_schema.get("properties", {}):
diff --git a/tests/models/blip_2/test_modeling_blip_2.py b/tests/models/blip_2/test_modeling_blip_2.py
@@ -28,7 +28,6 @@
     require_torch,
     require_torch_accelerator,
     require_torch_fp16,
-    require_torch_gpu,
     require_torch_multi_accelerator,
     require_vision,
     slow,
@@ -1734,7 +1733,7 @@ def test_inference_t5_multi_accelerator(self):
         self.assertEqual(predictions[0].tolist(), expected_ids_and_text[0])
         self.assertEqual(generated_text, expected_ids_and_text[1])
 
-    @require_torch_gpu
+    @require_torch_accelerator
     def test_inference_itm(self):
         model_name = "Salesforce/blip2-itm-vit-g"
         processor = Blip2Processor.from_pretrained(model_name)
diff --git a/tests/models/falcon_h1/test_modeling_falcon_h1.py b/tests/models/falcon_h1/test_modeling_falcon_h1.py
@@ -23,7 +23,7 @@
     Expectations,
     get_device_properties,
     require_torch,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
     torch_device,
 )
@@ -400,7 +400,7 @@ def test_left_padding_compatibility(self):
 
 @slow
 @require_torch
-@require_torch_gpu
+@require_torch_accelerator
 class FalconH1ModelIntegrationTest(unittest.TestCase):
     @slow
     def test_falcon_h1_hard(self):
@@ -448,10 +448,36 @@ def test_falcon_h1_hard(self):
             6.
         """
 
+        EXPECTED_TEXT_XPU = """
+            user
+            Tell me about the french revolution.
+            assistant
+            The French Revolution (1789–1799) was a period of radical social and political upheaval in France that fundamentally transformed the nation and had profound effects on the rest of Europe and the world. Here are the key aspects of the revolution:
+
+            ### **Causes**
+            1. **Economic Crisis**: France was in severe financial trouble due to costly wars (particularly the American Revolution), extravagant spending by the monarchy, and inefficient taxation.
+            2. **Social Inequality**: The rigid class system (the Ancien Régime) favored the nobility and clergy while the majority of the population (the Third Estate) bore the brunt of taxation and had limited rights.
+            3. **Enlightenment Ideas**: Philosophers like Rousseau, Voltaire, and Montesquieu inspired ideas of liberty, equality, and popular sovereignty.
+            4. **Settlement of 1789**: The Estates-General convened to address the financial crisis, leading to debates that exposed the weaknesses of the monarchy and the grievances of the common people.
+
+            ### **Key Events**
+            1. **Opening of the Revolution (1789)**:
+               - **Storming of the Bastille**: A symbol of royal tyranny, marking the start of the revolution.
+               - **Declaration of the Rights of Man and of the Citizen**: A foundational document proclaiming liberty, equality, and fraternity.
+
+            2. **Stages of the Revolution**:
+               - **Staffords' Reforms (1789–1791)**: Attempts to address grievances, including the abolition of feudal privileges and the introduction of the Civil Constitution of the Church.
+               - **Reign of Terror (1793–1794)**: Led by Maximilien Robespierre, characterized by mass executions of perceived enemies of the revolution, including King Louis XVI and Queen Marie Antoinette.
+               - **Thermidorian Reaction (1794)**: The fall of Robespierre and the end of the Reign of Terror.
+
+            3. **
+        """
+
         expected_texts = Expectations(
             {
                 (None, None): EXPECTED_TEXT_DEFAULT,
                 ("cuda", 8): EXPECTED_TEXT_A10,
+                ("xpu", None): EXPECTED_TEXT_XPU,
             }
         )
         EXPECTED_TEXT = expected_texts.get_expectation()
@@ -466,10 +492,9 @@ def test_falcon_h1_hard(self):
         model_id = "tiiuae/Falcon-H1-1.5B-Deep-Instruct"
         tokenizer = AutoTokenizer.from_pretrained(model_id)
         model = FalconH1ForCausalLM.from_pretrained(model_id, dtype=torch.bfloat16, device_map="auto")
-        device = "cuda"
         messages = [{"role": "user", "content": "Tell me about the french revolution."}]
         input_text = tokenizer.apply_chat_template(messages, tokenize=False)
-        inputs = tokenizer.encode(input_text, return_tensors="pt").to(device)
+        inputs = tokenizer.encode(input_text, return_tensors="pt").to(torch_device)
 
         with torch.no_grad():
             outputs = model.generate(inputs, max_new_tokens=512, do_sample=False)
diff --git a/tests/models/helium/test_modeling_helium.py b/tests/models/helium/test_modeling_helium.py
@@ -48,7 +48,6 @@ class HeliumModelTest(CausalLMModelTest, unittest.TestCase):
 
 
 @slow
-# @require_torch_gpu
 class HeliumIntegrationTest(unittest.TestCase):
     input_text = ["Hello, today is a great day to"]
 
diff --git a/tests/quantization/bitnet_integration/test_bitnet.py b/tests/quantization/bitnet_integration/test_bitnet.py
@@ -25,7 +25,7 @@
 from transformers.testing_utils import (
     backend_empty_cache,
     require_accelerate,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
     torch_device,
 )
@@ -39,7 +39,7 @@
     from accelerate import init_empty_weights
 
 
-@require_torch_gpu
+@require_torch_accelerator
 class BitNetQuantConfigTest(unittest.TestCase):
     def test_to_dict(self):
         """
@@ -53,7 +53,7 @@ def test_to_dict(self):
 
 
 @slow
-@require_torch_gpu
+@require_torch_accelerator
 @require_accelerate
 class BitNetTest(unittest.TestCase):
     model_name = "HF1BitLLM/Llama3-8B-1.58-100B-tokens"
@@ -197,7 +197,7 @@ def forward(self, x):
 
 
 @slow
-@require_torch_gpu
+@require_torch_accelerator
 @require_accelerate
 class BitNetSerializationTest(unittest.TestCase):
     def test_model_serialization(self):
diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py
@@ -4142,7 +4142,7 @@ def test_fp16_full_eval(self):
             # perfect world: fp32_init/2 == fp16_eval
             self.assertAlmostEqual(fp16_eval, fp32_init / 2, delta=5_000)
 
-    @require_torch_gpu
+    @require_torch_accelerator
     @pytest.mark.torch_compile_test
     def test_torch_compile_train(self):
         with tempfile.TemporaryDirectory() as tmp_dir:
@@ -4154,7 +4154,7 @@ def test_torch_compile_train(self):
             metrics = trainer.train()
             self.assertAlmostEqual(metrics.training_loss, original_train_loss)
 
-    @require_torch_gpu
+    @require_torch_accelerator
     @pytest.mark.torch_compile_test
     def test_torch_compile_eval(self):
         with tempfile.TemporaryDirectory() as tmp_dir:
@@ -4165,7 +4165,7 @@ def test_torch_compile_eval(self):
             trainer = get_regression_trainer(torch_compile=True, output_dir=tmp_dir)
             metrics = trainer.evaluate()
 
-            self.assertAlmostEqual(metrics["eval_loss"], original_eval_loss)
+            self.assertAlmostEqual(metrics["eval_loss"], original_eval_loss, delta=1e-6)
 
     @require_torch_accelerator
     @require_torch_bf16
diff --git a/tests/utils/test_chat_parsing_utils.py b/tests/utils/test_chat_parsing_utils.py
@@ -281,6 +281,7 @@ def test_smollm_template_thinking_and_tool_call(self):
         self.assertEqual(
             parsed_chat,
             {
+                "role": "assistant",
                 "thinking": 'Okay, the user said, "Hello! How are you?" I need to respond appropriately. Since this is the first message, I should greet them back and ask how I can assist. I should keep it friendly and open-ended. Let me make sure the response is welcoming and encourages them to share what they need help with. I\'ll avoid any technical jargon and keep it simple. Let me check for any typos and ensure the tone is positive.',
                 "tool_calls": [
                     {
@@ -302,9 +303,10 @@ def test_smollm_template_tool_call_no_thinking(self):
         self.assertEqual(
             parsed_chat,
             {
+                "role": "assistant",
                 "tool_calls": [
                     {"type": "function", "function": {"name": "get_weather", "arguments": {"city": "Paris"}}}
-                ]
+                ],
             },
         )
 
@@ -314,6 +316,7 @@ def test_smollm_template_thinking_no_tool_call(self):
         self.assertEqual(
             parsed_chat,
             {
+                "role": "assistant",
                 "content": "Some content about gravity goes here but I'm cutting it off to make this shorter!",
                 "thinking": 'Okay, the user asked, "Hey! Can you tell me about gravity?" Let me start by breaking down what they might be looking for. They probably want a basic understanding of gravity, maybe for a school project or just personal curiosity. I should explain what gravity is, how it works, and maybe some examples.',
             },
@@ -325,6 +328,7 @@ def test_qwen3_tool_calls(self):
         self.assertEqual(
             parsed_chat,
             {
+                "role": "assistant",
                 "tool_calls": [
                     {
                         "type": "function",
@@ -336,6 +340,6 @@ def test_qwen3_tool_calls(self):
                             },
                         },
                     }
-                ]
+                ],
             },
         )
diff --git a/utils/check_bad_commit.py b/utils/check_bad_commit.py
@@ -97,7 +97,17 @@ def is_bad_commit(target_test, commit):
     # Restore to original commit
     repo.git.checkout(original_head)
 
-    return result.returncode != 0
+    n_passed = 0
+    o = re.findall(r"====.* (\d+) passed", result.stdout)
+    if len(o) > 0:
+        n_passed = int(o[0])
+
+    n_failed = 0
+    o = re.findall(r"====.* (\d+) failed", result.stdout)
+    if len(o) > 0:
+        n_failed = int(o[0])
+
+    return result.returncode != 0, n_failed, n_passed
 
 
 def find_bad_commit(target_test, start_commit, end_commit):
@@ -113,7 +123,8 @@ def find_bad_commit(target_test, start_commit, end_commit):
     """
 
     # check if `end_commit` fails the test
-    failed_before = is_bad_commit(target_test, end_commit)
+    # (we only need one failure to conclude the test is flaky on the previous run with `end_commit`)
+    failed_before, _, _ = is_bad_commit(target_test, end_commit)
     if failed_before:
         return (
             None,
@@ -130,8 +141,9 @@ def find_bad_commit(target_test, start_commit, end_commit):
 
     # Now, we are (almost) sure `target_test` is not failing at `end_commit`
     # check if `start_commit` fail the test
-    failed_now = is_bad_commit(target_test, start_commit)
-    if not failed_now:
+    # **IMPORTANT** we only need one pass to conclude the test is flaky on the current run with `start_commit`!
+    _, n_failed, n_passed = is_bad_commit(target_test, start_commit)
+    if n_passed > 0:
         # failed on CI run, but not reproducible here --> don't report
         return None, f"flaky: test fails on the current CI run (commit: {start_commit}) but passes during the check."
 
@@ -194,12 +206,13 @@ def get_commit_info(commit):
         if pr_for_commit["merged_by"] is not None:
             merged_author = pr_for_commit["merged_by"]["login"]
 
+    url = f"https://api.github.com/repos/huggingface/transformers/commits/{commit}"
+    commit_info = requests.get(url).json()
+    parent = commit_info["parents"][0]["sha"]
     if author is None:
-        url = f"https://api.github.com/repos/huggingface/transformers/commits/{commit}"
-        commit_info = requests.get(url).json()
         author = commit_info["author"]["login"]
 
-    return {"commit": commit, "pr_number": pr_number, "author": author, "merged_by": merged_author}
+    return {"commit": commit, "pr_number": pr_number, "author": author, "merged_by": merged_author, "parent": parent}
 
 
 if __name__ == "__main__":