From 1d2a0d4adef2a760a14981de87eebefab291b228 Mon Sep 17 00:00:00 2001 From: Chenhe Gu Date: Thu, 29 Jan 2026 16:46:25 +0800 Subject: [PATCH 1/3] fix mm processor --- slime/utils/processing_utils.py | 48 ++++++++++++++++++++++++++++++--- 1 file changed, 44 insertions(+), 4 deletions(-) diff --git a/slime/utils/processing_utils.py b/slime/utils/processing_utils.py index e48d46571..1b61068f6 100644 --- a/slime/utils/processing_utils.py +++ b/slime/utils/processing_utils.py @@ -1,6 +1,7 @@ import base64 import io import logging +import types from transformers import AutoProcessor, AutoTokenizer, PreTrainedTokenizerBase, ProcessorMixin @@ -11,11 +12,35 @@ # Reference: https://github.com/QwenLM/Qwen3-VL/blob/main/qwen-vl-utils/README.md DEFAULT_PATCH_SIZE = 14 +_qwen_process_vision_info = None + def load_tokenizer(name_or_path: str, **kwargs): return AutoTokenizer.from_pretrained(name_or_path, **kwargs) +def _patch_processor_call(processor: ProcessorMixin): + """Patch processor.__call__ to inject default kwargs for modality-specific processors.""" + original_call = processor.__call__ + + def patched_call(self, *args, **kwargs): + # force return_tensors to None for input_ids + kwargs["return_tensors"] = None + # have been resized by qwen_vl_utils, update this when supporting other models + kwargs["do_resize"] = False + + # set return_tensors="pt" for modality-specific outputs + for modality_kwargs_key in ("audio_kwargs", "images_kwargs", "videos_kwargs"): + if modality_kwargs_key not in kwargs: + kwargs[modality_kwargs_key] = {"return_tensors": "pt"} + else: + kwargs[modality_kwargs_key].setdefault("return_tensors", "pt") + + return original_call(*args, **kwargs) + + processor.__call__ = types.MethodType(patched_call, processor) + + def load_processor(name_or_path: str, **kwargs): try: proc = AutoProcessor.from_pretrained(name_or_path, **kwargs) @@ -27,19 +52,33 @@ def load_processor(name_or_path: str, **kwargs): if isinstance(proc, PreTrainedTokenizerBase) or not isinstance(proc, ProcessorMixin): proc = None + # Patch processor __call__ to add default kwargs + if proc is not None: + _patch_processor_call(proc) + + global _qwen_process_vision_info + if _qwen_process_vision_info is None: + try: + from qwen_vl_utils import process_vision_info as _fn + + _qwen_process_vision_info = _fn + except ImportError: + logger.warning("qwen_vl_utils not installed, process_vision_info will not work") + return proc def process_vision_info(prompt, processor): - # temporary solution, will write image utils for slime later - from qwen_vl_utils import process_vision_info + # TODO: temporary solution, will write image utils for slime later + if _qwen_process_vision_info is None: + raise ImportError("qwen_vl_utils is not installed. 
Install it with: pip install qwen-vl-utils") if hasattr(processor.image_processor, "patch_size"): image_patch_size = processor.image_processor.patch_size else: logger.info(f"Using default patch size: {DEFAULT_PATCH_SIZE}") image_patch_size = DEFAULT_PATCH_SIZE - images, videos = process_vision_info(prompt, image_patch_size=image_patch_size) + images, videos = _qwen_process_vision_info(prompt, image_patch_size=image_patch_size) multimodal_inputs = {"images": images, "videos": videos} return multimodal_inputs @@ -50,4 +89,5 @@ def encode_image_for_rollout_engine(image) -> str: if image.mode != "RGB": image = image.convert("RGB") image.save(buffer, format="PNG") - return base64.b64encode(buffer.getvalue()).decode("utf-8") + image_base64 = base64.b64encode(buffer.getvalue()).decode("utf-8") + return f"data:image/png;base64,{image_base64}" From c0d1f3d66f31c95d592868ac95f0c77b599dcbd3 Mon Sep 17 00:00:00 2001 From: Chenhe Gu Date: Thu, 29 Jan 2026 17:11:59 +0800 Subject: [PATCH 2/3] update --- slime/rollout/sglang_rollout.py | 10 ++++++++-- slime/utils/processing_utils.py | 34 ++++++++++++++++----------------- 2 files changed, 24 insertions(+), 20 deletions(-) diff --git a/slime/rollout/sglang_rollout.py b/slime/rollout/sglang_rollout.py index a0cc52fb6..a9f6e094b 100644 --- a/slime/rollout/sglang_rollout.py +++ b/slime/rollout/sglang_rollout.py @@ -20,7 +20,12 @@ from slime.utils.eval_config import EvalDatasetConfig from slime.utils.http_utils import get, post from slime.utils.misc import SingletonMeta, load_function -from slime.utils.processing_utils import encode_image_for_rollout_engine, load_processor, load_tokenizer +from slime.utils.processing_utils import ( + build_processor_kwargs, + encode_image_for_rollout_engine, + load_processor, + load_tokenizer, +) from slime.utils.types import Sample from .rm_hub import async_rm, batched_async_rm @@ -112,7 +117,8 @@ async def generate(args: Namespace, sample: Sample, sampling_params: dict[str, A ), f"Sample status is {sample.status}" if state.processor: - processor_output = state.processor(text=sample.prompt, **sample.multimodal_inputs) + processor_kwargs = build_processor_kwargs(sample.multimodal_inputs) + processor_output = state.processor(text=sample.prompt, **processor_kwargs) prompt_ids = processor_output["input_ids"][0] sample.multimodal_train_inputs = { k: v for k, v in processor_output.items() if k not in ["input_ids", "attention_mask"] diff --git a/slime/utils/processing_utils.py b/slime/utils/processing_utils.py index 1b61068f6..9ffd3346c 100644 --- a/slime/utils/processing_utils.py +++ b/slime/utils/processing_utils.py @@ -1,7 +1,6 @@ import base64 import io import logging -import types from transformers import AutoProcessor, AutoTokenizer, PreTrainedTokenizerBase, ProcessorMixin @@ -19,26 +18,28 @@ def load_tokenizer(name_or_path: str, **kwargs): return AutoTokenizer.from_pretrained(name_or_path, **kwargs) -def _patch_processor_call(processor: ProcessorMixin): - """Patch processor.__call__ to inject default kwargs for modality-specific processors.""" - original_call = processor.__call__ +def build_processor_kwargs(multimodal_inputs: dict | None = None) -> dict: - def patched_call(self, *args, **kwargs): + forced = { # force return_tensors to None for input_ids - kwargs["return_tensors"] = None + "return_tensors": None, # have been resized by qwen_vl_utils, update this when supporting other models - kwargs["do_resize"] = False + "do_resize": False, + } + modality_forced = {"return_tensors": "pt"} - # set return_tensors="pt" for 
modality-specific outputs - for modality_kwargs_key in ("audio_kwargs", "images_kwargs", "videos_kwargs"): - if modality_kwargs_key not in kwargs: - kwargs[modality_kwargs_key] = {"return_tensors": "pt"} - else: - kwargs[modality_kwargs_key].setdefault("return_tensors", "pt") + result = dict(multimodal_inputs) if multimodal_inputs else {} - return original_call(*args, **kwargs) + result.update(forced) - processor.__call__ = types.MethodType(patched_call, processor) + # set return_tensors="pt" for modality-specific outputs + for key in ("audio_kwargs", "images_kwargs", "videos_kwargs"): + if key in result: + result[key] = {**result[key], **modality_forced} + else: + result[key] = modality_forced.copy() + + return result def load_processor(name_or_path: str, **kwargs): @@ -52,10 +53,7 @@ def load_processor(name_or_path: str, **kwargs): if isinstance(proc, PreTrainedTokenizerBase) or not isinstance(proc, ProcessorMixin): proc = None - # Patch processor __call__ to add default kwargs if proc is not None: - _patch_processor_call(proc) - global _qwen_process_vision_info if _qwen_process_vision_info is None: try: From 870a0117694a472e2e6d7009c8325212b214e3e2 Mon Sep 17 00:00:00 2001 From: Chenhe Gu Date: Tue, 3 Feb 2026 20:14:45 +0800 Subject: [PATCH 3/3] update --- slime/utils/processing_utils.py | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) diff --git a/slime/utils/processing_utils.py b/slime/utils/processing_utils.py index 9ffd3346c..18aa27997 100644 --- a/slime/utils/processing_utils.py +++ b/slime/utils/processing_utils.py @@ -11,8 +11,6 @@ # Reference: https://github.com/QwenLM/Qwen3-VL/blob/main/qwen-vl-utils/README.md DEFAULT_PATCH_SIZE = 14 -_qwen_process_vision_info = None - def load_tokenizer(name_or_path: str, **kwargs): return AutoTokenizer.from_pretrained(name_or_path, **kwargs) @@ -23,8 +21,6 @@ def build_processor_kwargs(multimodal_inputs: dict | None = None) -> dict: forced = { # force return_tensors to None for input_ids "return_tensors": None, - # have been resized by qwen_vl_utils, update this when supporting other models - "do_resize": False, } modality_forced = {"return_tensors": "pt"} @@ -53,30 +49,19 @@ def load_processor(name_or_path: str, **kwargs): if isinstance(proc, PreTrainedTokenizerBase) or not isinstance(proc, ProcessorMixin): proc = None - if proc is not None: - global _qwen_process_vision_info - if _qwen_process_vision_info is None: - try: - from qwen_vl_utils import process_vision_info as _fn - - _qwen_process_vision_info = _fn - except ImportError: - logger.warning("qwen_vl_utils not installed, process_vision_info will not work") - return proc def process_vision_info(prompt, processor): # TODO: temporary solution, will write image utils for slime later - if _qwen_process_vision_info is None: - raise ImportError("qwen_vl_utils is not installed. Install it with: pip install qwen-vl-utils") + from qwen_vl_utils import process_vision_info as qwen_process_vision_info if hasattr(processor.image_processor, "patch_size"): image_patch_size = processor.image_processor.patch_size else: logger.info(f"Using default patch size: {DEFAULT_PATCH_SIZE}") image_patch_size = DEFAULT_PATCH_SIZE - images, videos = _qwen_process_vision_info(prompt, image_patch_size=image_patch_size) + images, videos = qwen_process_vision_info(prompt, image_patch_size=image_patch_size) multimodal_inputs = {"images": images, "videos": videos} return multimodal_inputs
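
Usage sketch (not part of the patch series): the snippet below shows roughly how load_processor, process_vision_info, and build_processor_kwargs are expected to fit together after PATCH 3/3, mirroring the call path in slime/rollout/sglang_rollout.py. The checkpoint name, the message contents, and the apply_chat_template call are illustrative assumptions for this sketch, not something the patches add.

# Rough usage sketch under the assumptions stated above; not part of the patches.
from slime.utils.processing_utils import (
    build_processor_kwargs,
    load_processor,
    process_vision_info,
)

processor = load_processor("Qwen/Qwen3-VL-8B-Instruct")  # placeholder checkpoint name

# qwen_vl_utils-style conversation; the image URL is a placeholder
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": "https://example.com/cat.png"},
            {"type": "text", "text": "Describe the image."},
        ],
    }
]

# Render the prompt text; assumes the processor exposes apply_chat_template
# (true for Qwen-VL-style processors in recent transformers).
prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# Resolves images/videos via qwen_vl_utils, using the processor's patch size
# (falling back to DEFAULT_PATCH_SIZE when the image processor has none).
multimodal_inputs = process_vision_info(messages, processor)

# build_processor_kwargs forces return_tensors=None for input_ids while keeping
# return_tensors="pt" for the audio/image/video feature groups.
processor_kwargs = build_processor_kwargs(multimodal_inputs)
processor_output = processor(text=prompt, **processor_kwargs)

prompt_ids = processor_output["input_ids"][0]
multimodal_train_inputs = {
    k: v for k, v in processor_output.items() if k not in ["input_ids", "attention_mask"]
}

This mirrors the generate() path in sglang_rollout.py, where sample.prompt and sample.multimodal_inputs are already prepared; the chat-template and process_vision_info steps above stand in for that upstream preparation.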