microsoft · timenick · May 26, 2026 · May 26, 2026 · May 26, 2026
@@ -10,7 +10,7 @@
 from __future__ import annotations
 
 import logging
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, NamedTuple
 
 from ..loader.task import (
     HF_TASK_DEFAULTS,
@@ -869,31 +869,37 @@ def resolve_processor(
     # This is fast and doesn't require downloading/instantiating processors
     # NOTE: These JSON keys (processor_class, image_processor_type, etc.) are
     # standard HuggingFace config conventions, not model-specific hardcoding.
+    has_preprocessor_config = True
     try:
-        hub_proc, hub_tok, hub_img, hub_fe = _resolve_processor_from_hub_configs(model_id)
-        if hub_proc and processor_class is None:
-            processor_class = hub_proc
+        hub_result = _resolve_processor_from_hub_configs(model_id)
+        if hub_result.processor_class and processor_class is None:
+            processor_class = hub_result.processor_class
             processor_source = "hub_config"
-        if hub_tok and tokenizer_class is None:
-            tokenizer_class = hub_tok
+        if hub_result.tokenizer_class and tokenizer_class is None:
+            tokenizer_class = hub_result.tokenizer_class
             tokenizer_source = "hub_config"
-        if hub_img and image_processor_class is None:
-            image_processor_class = hub_img
+        if hub_result.image_processor_class and image_processor_class is None:
+            image_processor_class = hub_result.image_processor_class
             image_processor_source = "hub_config"
-        if hub_fe and feature_extractor_class is None:
-            feature_extractor_class = hub_fe
+        if hub_result.feature_extractor_class and feature_extractor_class is None:
+            feature_extractor_class = hub_result.feature_extractor_class
             feature_extractor_source = "hub_config"
+        has_preprocessor_config = hub_result.has_preprocessor_config
     except Exception as e:
         logger.debug("Failed to resolve processors from hub configs: %s", e)
 
     # Strategy 2: Use Auto classes to fill in any missing information.
     # Skip entirely when Strategies 0 + 1 already populated every field —
     # each Auto* instantiation does its own HF Hub I/O plus class init
     # (AutoProcessor and AutoFeatureExtractor are several seconds each).
+    #
+    # When ``preprocessor_config.json`` is missing on the hub, the model
+    # has neither an image processor nor a feature extractor; skip those
+    # two Auto* round-trips (they would each spend ~1s confirming a 404).
     need_processor = processor_class is None
     need_tokenizer = tokenizer_class is None
-    need_image_processor = image_processor_class is None
-    need_feature_extractor = feature_extractor_class is None
+    need_image_processor = image_processor_class is None and has_preprocessor_config
+    need_feature_extractor = feature_extractor_class is None and has_preprocessor_config
 
     if need_processor or need_tokenizer or need_image_processor or need_feature_extractor:
         try:
@@ -938,9 +944,21 @@ def resolve_processor(
     )
 
 
-def _resolve_processor_from_hub_configs(
-    model_id: str,
-) -> tuple[str | None, str | None, str | None, str | None]:
+class _HubConfigResult(NamedTuple):
+    """Result of ``_resolve_processor_from_hub_configs``.
+
+    A NamedTuple rather than a plain tuple so the trailing boolean cannot be
+    silently swapped with the four ``str | None`` fields at the call site.
+    """
+
+    processor_class: str | None
+    tokenizer_class: str | None
+    image_processor_class: str | None
+    feature_extractor_class: str | None
+    has_preprocessor_config: bool
+
+
+def _resolve_processor_from_hub_configs(model_id: str) -> _HubConfigResult:
     """Resolve processor classes by fetching config files from HuggingFace Hub.
 
     This approach is fast because it only downloads small JSON config files,
@@ -950,7 +968,12 @@ def _resolve_processor_from_hub_configs(
         model_id: HuggingFace model identifier
 
     Returns:
-        Tuple of (processor_class, tokenizer_class, image_processor_class, feature_extractor_class)
+        A ``_HubConfigResult`` whose ``has_preprocessor_config`` reports
+        whether ``preprocessor_config.json`` actually exists on the hub —
+        the authoritative signal that the model has no image processor or
+        feature extractor, so the caller can skip the corresponding
+        ``AutoImageProcessor`` / ``AutoFeatureExtractor`` round-trips
+        (which would each spend ~1s confirming a 404 on text-only models).
     """
     import json
     from pathlib import Path
@@ -962,6 +985,7 @@ def _resolve_processor_from_hub_configs(
     tokenizer_class: str | None = None
     image_processor_class: str | None = None
     feature_extractor_class: str | None = None
+    has_preprocessor_config = False
 
     # Try to download and parse preprocessor_config.json
     # This file contains image_processor_type or processor_class
@@ -970,6 +994,11 @@ def _resolve_processor_from_hub_configs(
             repo_id=model_id,
             filename="preprocessor_config.json",
         )
+        # Set the flag as soon as the file exists on the hub, *before* parsing.
+        # A corrupt JSON is still proof that the model ships preprocessor
+        # config — fall back to Auto* lookups rather than declaring the model
+        # text-only and silently dropping its image/feature processor.
+        has_preprocessor_config = True
         with Path(preprocessor_config_path).open(encoding="utf-8") as f:
             preprocessor_config = json.load(f)
 
@@ -1009,7 +1038,34 @@ def _resolve_processor_from_hub_configs(
     except json.JSONDecodeError as e:
         logger.debug("Failed to parse tokenizer_config.json for %s: %s", model_id, e)
 
-    return processor_class, tokenizer_class, image_processor_class, feature_extractor_class
+    return _HubConfigResult(
+        processor_class=processor_class,
+        tokenizer_class=tokenizer_class,
+        image_processor_class=image_processor_class,
+        feature_extractor_class=feature_extractor_class,
+        has_preprocessor_config=has_preprocessor_config,
+    )
+
+
+def _is_tokenizer_class_name(name: str) -> bool:
+    """Heuristic: does this transformers class name look like a tokenizer?
+
+    Tokenizer classes follow the ``*Tokenizer`` / ``*TokenizerFast`` naming
+    convention (e.g. ``RobertaTokenizer``, ``BertTokenizerFast``). Used to
+    detect when ``AutoProcessor.from_pretrained`` returned a leaf tokenizer
+    rather than a multimodal ``ProcessorMixin`` wrapper.
+    """
+    return name.endswith(("Tokenizer", "TokenizerFast"))
+
+
+def _is_image_processor_class_name(name: str) -> bool:
+    """Heuristic: does this transformers class name look like an image processor?"""
+    return name.endswith(("ImageProcessor", "ImageProcessorFast"))
+
+
+def _is_feature_extractor_class_name(name: str) -> bool:
+    """Heuristic: does this transformers class name look like a feature extractor?"""
+    return name.endswith("FeatureExtractor")
 
 
 def _resolve_processor_from_auto_classes(
@@ -1058,28 +1114,31 @@ def _resolve_processor_from_auto_classes(
             processor = AutoProcessor.from_pretrained(model_id, use_fast=True)
             processor_class = type(processor).__name__
 
-            # AutoProcessor may wrap tokenizer and image_processor
-            if (
-                try_tokenizer
-                and hasattr(processor, "tokenizer")
-                and processor.tokenizer is not None
-            ):
-                tokenizer_class = type(processor.tokenizer).__name__
-
-            if (
-                try_image_processor
-                and hasattr(processor, "image_processor")
-                and processor.image_processor is not None
-            ):
-                image_processor_class = type(processor.image_processor).__name__
-
-            # Some older models use feature_extractor instead of image_processor
-            if (
-                try_feature_extractor
-                and hasattr(processor, "feature_extractor")
-                and processor.feature_extractor is not None
-            ):
-                feature_extractor_class = type(processor.feature_extractor).__name__
+            # AutoProcessor may wrap tokenizer / image_processor / feature_extractor
+            # as a multimodal `ProcessorMixin`.  For single-modality models it
+            # often returns the leaf class directly (e.g. RoBERTa →
+            # `RobertaTokenizerFast`), which has none of those attributes.
+            # Pattern-match the returned class name so the standalone Auto*
+            # calls below can be skipped — otherwise we pay for a second,
+            # redundant load (~2s for AutoTokenizer on warm cache).
+            wrapped_tokenizer = getattr(processor, "tokenizer", None)
+            wrapped_image_processor = getattr(processor, "image_processor", None)
+            wrapped_feature_extractor = getattr(processor, "feature_extractor", None)
+
+            if try_tokenizer and wrapped_tokenizer is not None:
+                tokenizer_class = type(wrapped_tokenizer).__name__
+            elif try_tokenizer and _is_tokenizer_class_name(processor_class):
+                tokenizer_class = processor_class
+
+            if try_image_processor and wrapped_image_processor is not None:
+                image_processor_class = type(wrapped_image_processor).__name__
+            elif try_image_processor and _is_image_processor_class_name(processor_class):
+                image_processor_class = processor_class
+
+            if try_feature_extractor and wrapped_feature_extractor is not None:
+                feature_extractor_class = type(wrapped_feature_extractor).__name__
+            elif try_feature_extractor and _is_feature_extractor_class_name(processor_class):
+                feature_extractor_class = processor_class
 
         except Exception as e:
             logger.debug("AutoProcessor failed for %s: %s", model_id, e)