@@ -292,31 +292,31 @@ <h3 class="text-xl font-bold text-slate-900" data-i18n="vl_bench_title">性能
292292 < thead >
293293 < tr >
294294 < th > Benchmark</ th >
295- < th > Qwen3-VL 4B</ th >
295+ <!-- < th>Qwen3-VL 4B</th> -- >
296296 < th > InternVL-3.5 4B</ th >
297297 < th > UFO 8B</ th >
298298 < th class ="text-purple-600 "> Youtu-VL 4B</ th >
299299 </ tr >
300300 </ thead >
301301 < tbody >
302302 < tr > < td colspan ="5 " class ="font-semibold bg-slate-50 text-slate-700 "> Visual Grounding</ td > </ tr >
303- < tr > < td > RefCOCO val</ td > < td > 90.7% </ td > < td > 92.5%</ td > < td > 91.8%</ td > < td class ="highlight-win "> 93.6%</ td > </ tr >
304- < tr > < td > RefCOCO testA</ td > < td > 92.2% </ td > < td > 94.3%</ td > < td > 94.3%</ td > < td class ="highlight-win "> 95.2%</ td > </ tr >
305- < tr > < td > RefCOCO+ val</ td > < td > 82.9% </ td > < td > 87.6%</ td > < td > 86.9%</ td > < td class ="highlight-win "> 90.1%</ td > </ tr >
306- < tr > < td > RefCOCOg test</ td > < td > 87.7% </ td > < td > 89.3%</ td > < td > 88.6%</ td > < td class ="highlight-win "> 92.9%</ td > </ tr >
303+ < tr > < td > RefCOCO val</ td > < td > 92.5%</ td > < td > 91.8%</ td > < td class ="highlight-win "> 93.6%</ td > </ tr >
304+ < tr > < td > RefCOCO testA</ td > < td > 94.3%</ td > < td > 94.3%</ td > < td class ="highlight-win "> 95.2%</ td > </ tr >
305+ < tr > < td > RefCOCO+ val</ td > < td > 87.6%</ td > < td > 86.9%</ td > < td class ="highlight-win "> 90.1%</ td > </ tr >
306+ < tr > < td > RefCOCOg test</ td > < td > 89.3%</ td > < td > 88.6%</ td > < td class ="highlight-win "> 92.9%</ td > </ tr >
307307 < tr > < td colspan ="5 " class ="font-semibold bg-slate-50 text-slate-700 "> Semantic Segmentation</ td > </ tr >
308- < tr > < td > ADE20k</ td > < td > ×</ td > < td > × </ td > < td > 54.5%</ td > < td class ="highlight-second "> 54.2%</ td > </ tr >
309- < tr > < td > Cityscapes</ td > < td > ×</ td > < td > × </ td > < td > -</ td > < td class ="highlight-win "> 70.4%</ td > </ tr >
310- < tr > < td > VOC20</ td > < td > ×</ td > < td > × </ td > < td > -</ td > < td class ="highlight-win "> 92.5%</ td > </ tr >
308+ < tr > < td > ADE20k</ td > < td > ×</ td > < td > 54.5%</ td > < td class ="highlight-second "> 54.2%</ td > </ tr >
309+ < tr > < td > Cityscapes</ td > < td > ×</ td > < td > -</ td > < td class ="highlight-win "> 70.4%</ td > </ tr >
310+ < tr > < td > VOC20</ td > < td > ×</ td > < td > -</ td > < td class ="highlight-win "> 92.5%</ td > </ tr >
311311 < tr > < td colspan ="5 " class ="font-semibold bg-slate-50 text-slate-700 "> Referring Segmentation</ td > </ tr >
312- < tr > < td > RefCOCO val</ td > < td > ×</ td > < td > × </ td > < td > 80.0%</ td > < td class ="highlight-win "> 80.7%</ td > </ tr >
313- < tr > < td > RefCOCO testA</ td > < td > ×</ td > < td > × </ td > < td > 81.6%</ td > < td class ="highlight-win "> 82.0%</ td > </ tr >
312+ < tr > < td > RefCOCO val</ td > < td > ×</ td > < td > 80.0%</ td > < td class ="highlight-win "> 80.7%</ td > </ tr >
313+ < tr > < td > RefCOCO testA</ td > < td > ×</ td > < td > 81.6%</ td > < td class ="highlight-win "> 82.0%</ td > </ tr >
314314 < tr > < td colspan ="5 " class ="font-semibold bg-slate-50 text-slate-700 "> Depth Estimation</ td > </ tr >
315- < tr > < td > NYUv2 (δ1)</ td > < td > ×</ td > < td > × </ td > < td > 93.6%</ td > < td class ="highlight-second "> 90.4%</ td > </ tr >
316- < tr > < td > Cityscapes</ td > < td > ×</ td > < td > × </ td > < td > -</ td > < td class ="highlight-win "> 92.7%</ td > </ tr >
315+ < tr > < td > NYUv2 (δ1)</ td > < td > ×</ td > < td > 93.6%</ td > < td class ="highlight-second "> 90.4%</ td > </ tr >
316+ < tr > < td > Cityscapes</ td > < td > ×</ td > < td > -</ td > < td class ="highlight-win "> 92.7%</ td > </ tr >
317317 < tr > < td colspan ="5 " class ="font-semibold bg-slate-50 text-slate-700 "> Object Counting</ td > </ tr >
318- < tr > < td > TallyQA-Simple</ td > < td > 79.0% </ td > < td > 77.6%</ td > < td > ×</ td > < td class ="highlight-win "> 85.1%</ td > </ tr >
319- < tr > < td > CountBench</ td > < td > 78.4% </ td > < td > 79.4%</ td > < td > ×</ td > < td class ="highlight-win "> 88.6%</ td > </ tr >
318+ < tr > < td > TallyQA-Simple</ td > < td > 77.6%</ td > < td > ×</ td > < td class ="highlight-win "> 85.1%</ td > </ tr >
319+ < tr > < td > CountBench</ td > < td > 79.4%</ td > < td > ×</ td > < td class ="highlight-win "> 88.6%</ td > </ tr >
320320 </ tbody >
321321 </ table >
322322 < p class ="text-xs text-slate-500 mt-4 text-center " data-i18n ="vl_table_note1 "> * × 表示模型不支持该任务。</ p >
@@ -330,30 +330,30 @@ <h3 class="text-xl font-bold text-slate-900" data-i18n="vl_bench_title">性能
330330 < th > Benchmark</ th >
331331 < th > Qwen3-VL 8B</ th >
332332 < th > InternVL-3.5 4B</ th >
333- < th > Qwen3-VL 4B</ th >
333+ <!-- < th>Qwen3-VL 4B</th> -- >
334334 < th class ="text-purple-600 "> Youtu-VL 4B</ th >
335335 </ tr >
336336 </ thead >
337337 < tbody >
338338 < tr > < td colspan ="5 " class ="font-semibold bg-slate-50 text-slate-700 "> General VQA</ td > </ tr >
339- < tr > < td > MMBench_EN</ td > < td > 84.5%</ td > < td > 80.3%</ td > < td > 83.9% </ td > < td class ="highlight-second "> 83.9%</ td > </ tr >
340- < tr > < td > MMStar</ td > < td > 70.9%</ td > < td > 65.0%</ td > < td > 69.8% </ td > < td class ="highlight-win "> 71.1%</ td > </ tr >
341- < tr > < td > MME (/2800)</ td > < td > -</ td > < td > 2272</ td > < td > 2309 </ td > < td class ="highlight-win "> 2384</ td > </ tr >
342- < tr > < td > ScienceQA_val</ td > < td > -</ td > < td > -</ td > < td > 94.7% </ td > < td class ="highlight-win "> 97.0%</ td > </ tr >
339+ < tr > < td > MMBench_EN</ td > < td > 84.5%</ td > < td > 80.3%</ td > < td class ="highlight-second "> 83.9%</ td > </ tr >
340+ < tr > < td > MMStar</ td > < td > 70.9%</ td > < td > 65.0%</ td > < td class ="highlight-win "> 71.1%</ td > </ tr >
341+ < tr > < td > MME (/2800)</ td > < td > -</ td > < td > 2272</ td > < td class ="highlight-win "> 2384</ td > </ tr >
342+ < tr > < td > ScienceQA_val</ td > < td > -</ td > < td > -</ td > < td class ="highlight-win "> 97.0%</ td > </ tr >
343343 < tr > < td colspan ="5 " class ="font-semibold bg-slate-50 text-slate-700 "> Multimodal Reasoning</ td > </ tr >
344- < tr > < td > VisuLogic</ td > < td > 22.5%</ td > < td > -</ td > < td > 19.0% </ td > < td class ="highlight-win "> 25.7%</ td > </ tr >
345- < tr > < td > MathVista_mini</ td > < td > 77.2%</ td > < td > 77.1%</ td > < td > 73.7% </ td > < td class ="highlight-second "> 76.5%</ td > </ tr >
346- < tr > < td > MathVerse_mini</ td > < td > 62.1%</ td > < td > 45.8%</ td > < td > 46.8% </ td > < td class ="highlight-second "> 56.5%</ td > </ tr >
347- < tr > < td > VLMsAreBlind</ td > < td > 74.0%</ td > < td > -</ td > < td > 71.9% </ td > < td class ="highlight-win "> 88.9%</ td > </ tr >
344+ < tr > < td > VisuLogic</ td > < td > 22.5%</ td > < td > -</ td > < td class ="highlight-win "> 25.7%</ td > </ tr >
345+ < tr > < td > MathVista_mini</ td > < td > 77.2%</ td > < td > 77.1%</ td > < td class ="highlight-second "> 76.5%</ td > </ tr >
346+ < tr > < td > MathVerse_mini</ td > < td > 62.1%</ td > < td > 45.8%</ td > < td class ="highlight-second "> 56.5%</ td > </ tr >
347+ < tr > < td > VLMsAreBlind</ td > < td > 74.0%</ td > < td > -</ td > < td class ="highlight-win "> 88.9%</ td > </ tr >
348348 < tr > < td colspan ="5 " class ="font-semibold bg-slate-50 text-slate-700 "> OCR & Document</ td > </ tr >
349- < tr > < td > AI2D_test</ td > < td > 85.7%</ td > < td > 82.6%</ td > < td > 84.1% </ td > < td class ="highlight-second "> 85.6%</ td > </ tr >
350- < tr > < td > DocVQA_val</ td > < td > 96.1%</ td > < td > 92.4%</ td > < td > 95.3% </ td > < td class ="highlight-second "> 94.4%</ td > </ tr >
351- < tr > < td > ChartQA_test</ td > < td > 89.6%</ td > < td > 86.0%</ td > < td > 84.6% </ td > < td class ="highlight-second "> 85.3%</ td > </ tr >
349+ < tr > < td > AI2D_test</ td > < td > 85.7%</ td > < td > 82.6%</ td > < td class ="highlight-second "> 85.6%</ td > </ tr >
350+ < tr > < td > DocVQA_val</ td > < td > 96.1%</ td > < td > 92.4%</ td > < td class ="highlight-second "> 94.4%</ td > </ tr >
351+ < tr > < td > ChartQA_test</ td > < td > 89.6%</ td > < td > 86.0%</ td > < td class ="highlight-second "> 85.3%</ td > </ tr >
352352 < tr > < td colspan ="5 " class ="font-semibold bg-slate-50 text-slate-700 "> GUI Agent</ td > </ tr >
353- < tr > < td > ScreenSpot Pro</ td > < td > 54.6%</ td > < td > -</ td > < td > 59.5% </ td > < td class ="highlight-win "> 59.6%</ td > </ tr >
354- < tr > < td > OSWorld</ td > < td > 33.9%</ td > < td > -</ td > < td > 26.2% </ td > < td class ="highlight-win "> 38.8%</ td > </ tr >
353+ < tr > < td > ScreenSpot Pro</ td > < td > 54.6%</ td > < td > -</ td > < td class ="highlight-win "> 59.6%</ td > </ tr >
354+ < tr > < td > OSWorld</ td > < td > 33.9%</ td > < td > -</ td > < td class ="highlight-win "> 38.8%</ td > </ tr >
355355 < tr > < td colspan ="5 " class ="font-semibold bg-slate-50 text-slate-700 "> Real-World</ td > </ tr >
356- < tr > < td > RealWorldQA</ td > < td > 71.5%</ td > < td > 66.3%</ td > < td > 70.9% </ td > < td class ="highlight-win "> 74.6%</ td > </ tr >
356+ < tr > < td > RealWorldQA</ td > < td > 71.5%</ td > < td > 66.3%</ td > < td class ="highlight-win "> 74.6%</ td > </ tr >
357357 </ tbody >
358358 </ table >
359359 < p class ="text-xs text-slate-500 mt-4 text-center " data-i18n ="vl_table_note2 "> * 对比同级别 VLM 指令模型表现。</ p >
0 commit comments