qwen 3 vl with apply_chat_template true

brian-dellabetta · brian-dellabetta · commit 57e50b1885a4 · 2025-10-21T21:57:16.000Z

Signed-off-by: Brian Dellabetta <bdellabe@redhat.com>
diff --git a/tests/lmeval/configs/vl_fp8_dynamic_per_token.yaml b/tests/lmeval/configs/vl_fp8_dynamic_per_token.yaml
@@ -1,18 +1,17 @@
 cadence: weekly
-model: Qwen/Qwen2.5-VL-7B-Instruct
-model_class: Qwen2_5_VLForConditionalGeneration
+model: Qwen/Qwen3-VL-8B-Instruct
+model_class: Qwen3VLForConditionalGeneration
 scheme: FP8_DYNAMIC
 recipe: tests/e2e/vLLM/recipes/FP8/recipe_fp8_dynamic.yaml
 lmeval:
   model: "hf-multimodal"
   model_args:
     dtype: bfloat16
-    add_bos_token: True
     convert_img_format: True
   task: mmmu_val_literature
+  apply_chat_template: True
   num_fewshot: 0
   batch_size: 8
-  # dense model achieves accuracy of 0.9 +/ 0.0557
+  # dense model achieves accuracy of 0.833
   metrics:
-    acc,none: 0.8333
-    acc_stderr,none: 0.0557
+    acc,none: 0.833
diff --git a/tests/lmeval/configs/vl_int8_w8a8_dynamic_per_token.yaml b/tests/lmeval/configs/vl_int8_w8a8_dynamic_per_token.yaml
@@ -1,6 +1,6 @@
 cadence: "weekly"
-model: Qwen/Qwen2.5-VL-7B-Instruct
-model_class: Qwen2_5_VLForConditionalGeneration
+model: Qwen/Qwen3-VL-8B-Instruct
+model_class: Qwen3VLForConditionalGeneration
 scheme: INT8_dyn_per_token
 recipe: tests/e2e/vLLM/recipes/INT8/recipe_int8_channel_weight_dynamic_per_token.yaml
 dataset_id: lmms-lab/flickr30k
@@ -9,12 +9,11 @@ lmeval:
   model: "hf-multimodal"
   model_args:
     dtype: bfloat16
-    add_bos_token: True
     convert_img_format: True
   task: mmmu_val_literature
+  apply_chat_template: True
   num_fewshot: 0
   batch_size: 8
-  # dense model achieves accuracy of 0.9 +/ 0.0557
+  # dense model achieves accuracy of 0.833
   metrics:
-    acc,none: 0.833
-    acc_stderr,none: 0.0557
+    acc,none: 0.833
diff --git a/tests/lmeval/configs/vl_w4a16_actorder_weight.yaml b/tests/lmeval/configs/vl_w4a16_actorder_weight.yaml
@@ -1,6 +1,6 @@
 cadence: "weekly"
-model: Qwen/Qwen2.5-VL-7B-Instruct
-model_class: Qwen2_5_VLForConditionalGeneration
+model: Qwen/Qwen3-VL-8B-Instruct
+model_class: Qwen3VLForConditionalGeneration
 scheme: W4A16_actorder_weight
 recipe: tests/e2e/vLLM/recipes/actorder/recipe_w4a16_actorder_weight.yaml
 dataset_id: lmms-lab/flickr30k
@@ -9,12 +9,11 @@ lmeval:
   model: "hf-multimodal"
   model_args:
     dtype: bfloat16
-    add_bos_token: True
     convert_img_format: True
   task: mmmu_val_literature
+  apply_chat_template: True
   num_fewshot: 0
   batch_size: 8
-  # dense model achieves accuracy of 0.9 +/ 0.0557
+  # dense model achieves accuracy of 0.8333
   metrics:
-    acc,none: 0.8333
-    acc_stderr,none: 0.0557
+    acc,none: 0.800
diff --git a/tests/lmeval/test_lmeval.py b/tests/lmeval/test_lmeval.py
@@ -25,6 +25,7 @@ class LmEvalConfig(BaseModel):
     num_fewshot: int = 5
     limit: int = 1000
     batch_size: int = 100
+    apply_chat_template: bool = False
     # Recovery testing (default): compare against base model performance
     # Default threshold is 0.95 (retain ≥95% of base), can be overridden
     recovery_threshold: Union[float, dict] = 0.95
@@ -160,6 +161,7 @@ def _eval_base_model(self):
             num_fewshot=self.lmeval.num_fewshot,
             limit=self.lmeval.limit,
             device="cuda:0",
+            apply_chat_template=self.lmeval.apply_chat_template,
             batch_size=self.lmeval.batch_size,
         )
 
@@ -190,6 +192,7 @@ def _run_lm_eval(self):
             num_fewshot=self.lmeval.num_fewshot,
             limit=self.lmeval.limit,
             device="cuda:0",
+            apply_chat_template=self.lmeval.apply_chat_template,
             batch_size=self.lmeval.batch_size,
         )