|
2 | 2 | import unittest |
3 | 3 | from copy import deepcopy |
4 | 4 | from dataclasses import dataclass |
| 5 | +from pathlib import Path |
5 | 6 | from typing import List |
6 | 7 |
|
7 | 8 | import pytest |
@@ -107,20 +108,24 @@ def __repr__(self) -> str: |
107 | 108 | class TestQwen2_5_VL(unittest.TestCase): |
108 | 109 |
|
109 | 110 | def get_test_inputs(self, modality: str): |
| 111 | + |
| 112 | + test_data_root = Path( |
| 113 | + os.path.join(llm_models_root(), "multimodals", "test_data")) |
| 114 | + |
110 | 115 | if modality == "image": |
111 | 116 | return ["Describe the natural environment in the image."], \ |
112 | | - ["https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/seashore.png"] |
| 117 | + [str(test_data_root / "seashore.png")] |
113 | 118 | elif modality == "multiple_image": |
114 | 119 | return ["Describe the difference between the two images."], \ |
115 | | - ["https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png", |
116 | | - "https://huggingface.co/datasets/Sayali9141/traffic_signal_images/resolve/main/61.jpg"] |
| 120 | + [str(test_data_root / "inpaint.png"), |
| 121 | + str(test_data_root / "61.jpg")] |
117 | 122 | elif modality == "video": |
118 | 123 | return ["Tell me what you see in the video briefly."], \ |
119 | | - ["https://huggingface.co/datasets/Efficient-Large-Model/VILA-inference-demos/resolve/main/OAI-sora-tokyo-walk.mp4"] |
| 124 | + [str(test_data_root / "OAI-sora-tokyo-walk.mp4")] |
120 | 125 | elif modality == "mixture_text_image": |
121 | 126 | return ["Describe the scene in the image briefly.", |
122 | 127 | "Who invented the internet?"], \ |
123 | | - ["https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png", |
| 128 | + [str(test_data_root / "inpaint.png"), |
124 | 129 | ""] |
125 | 130 | elif modality == "text": |
126 | 131 | return ["Who invented the internet?"], [] |
|
0 commit comments