diff --git a/src/winml/modelkit/inspect/resolver.py b/src/winml/modelkit/inspect/resolver.py index a55edf12b..51a6a9ccd 100644 --- a/src/winml/modelkit/inspect/resolver.py +++ b/src/winml/modelkit/inspect/resolver.py @@ -10,7 +10,7 @@ from __future__ import annotations import logging -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, NamedTuple from ..loader.task import ( HF_TASK_DEFAULTS, @@ -869,20 +869,22 @@ def resolve_processor( # This is fast and doesn't require downloading/instantiating processors # NOTE: These JSON keys (processor_class, image_processor_type, etc.) are # standard HuggingFace config conventions, not model-specific hardcoding. + has_preprocessor_config = True try: - hub_proc, hub_tok, hub_img, hub_fe = _resolve_processor_from_hub_configs(model_id) - if hub_proc and processor_class is None: - processor_class = hub_proc + hub_result = _resolve_processor_from_hub_configs(model_id) + if hub_result.processor_class and processor_class is None: + processor_class = hub_result.processor_class processor_source = "hub_config" - if hub_tok and tokenizer_class is None: - tokenizer_class = hub_tok + if hub_result.tokenizer_class and tokenizer_class is None: + tokenizer_class = hub_result.tokenizer_class tokenizer_source = "hub_config" - if hub_img and image_processor_class is None: - image_processor_class = hub_img + if hub_result.image_processor_class and image_processor_class is None: + image_processor_class = hub_result.image_processor_class image_processor_source = "hub_config" - if hub_fe and feature_extractor_class is None: - feature_extractor_class = hub_fe + if hub_result.feature_extractor_class and feature_extractor_class is None: + feature_extractor_class = hub_result.feature_extractor_class feature_extractor_source = "hub_config" + has_preprocessor_config = hub_result.has_preprocessor_config except Exception as e: logger.debug("Failed to resolve processors from hub configs: %s", e) @@ -890,10 +892,14 @@ def resolve_processor( # Skip entirely when Strategies 0 + 1 already populated every field — # each Auto* instantiation does its own HF Hub I/O plus class init # (AutoProcessor and AutoFeatureExtractor are several seconds each). + # + # When ``preprocessor_config.json`` is missing on the hub, the model + # has neither an image processor nor a feature extractor; skip those + # two Auto* round-trips (they would each spend ~1s confirming a 404). need_processor = processor_class is None need_tokenizer = tokenizer_class is None - need_image_processor = image_processor_class is None - need_feature_extractor = feature_extractor_class is None + need_image_processor = image_processor_class is None and has_preprocessor_config + need_feature_extractor = feature_extractor_class is None and has_preprocessor_config if need_processor or need_tokenizer or need_image_processor or need_feature_extractor: try: @@ -938,9 +944,21 @@ def resolve_processor( ) -def _resolve_processor_from_hub_configs( - model_id: str, -) -> tuple[str | None, str | None, str | None, str | None]: +class _HubConfigResult(NamedTuple): + """Result of ``_resolve_processor_from_hub_configs``. + + A NamedTuple rather than a plain tuple so the trailing boolean cannot be + silently swapped with the four ``str | None`` fields at the call site. + """ + + processor_class: str | None + tokenizer_class: str | None + image_processor_class: str | None + feature_extractor_class: str | None + has_preprocessor_config: bool + + +def _resolve_processor_from_hub_configs(model_id: str) -> _HubConfigResult: """Resolve processor classes by fetching config files from HuggingFace Hub. This approach is fast because it only downloads small JSON config files, @@ -950,7 +968,12 @@ def _resolve_processor_from_hub_configs( model_id: HuggingFace model identifier Returns: - Tuple of (processor_class, tokenizer_class, image_processor_class, feature_extractor_class) + A ``_HubConfigResult`` whose ``has_preprocessor_config`` reports + whether ``preprocessor_config.json`` actually exists on the hub — + the authoritative signal that the model has no image processor or + feature extractor, so the caller can skip the corresponding + ``AutoImageProcessor`` / ``AutoFeatureExtractor`` round-trips + (which would each spend ~1s confirming a 404 on text-only models). """ import json from pathlib import Path @@ -962,6 +985,7 @@ def _resolve_processor_from_hub_configs( tokenizer_class: str | None = None image_processor_class: str | None = None feature_extractor_class: str | None = None + has_preprocessor_config = False # Try to download and parse preprocessor_config.json # This file contains image_processor_type or processor_class @@ -970,6 +994,11 @@ def _resolve_processor_from_hub_configs( repo_id=model_id, filename="preprocessor_config.json", ) + # Set the flag as soon as the file exists on the hub, *before* parsing. + # A corrupt JSON is still proof that the model ships preprocessor + # config — fall back to Auto* lookups rather than declaring the model + # text-only and silently dropping its image/feature processor. + has_preprocessor_config = True with Path(preprocessor_config_path).open(encoding="utf-8") as f: preprocessor_config = json.load(f) @@ -1009,7 +1038,34 @@ def _resolve_processor_from_hub_configs( except json.JSONDecodeError as e: logger.debug("Failed to parse tokenizer_config.json for %s: %s", model_id, e) - return processor_class, tokenizer_class, image_processor_class, feature_extractor_class + return _HubConfigResult( + processor_class=processor_class, + tokenizer_class=tokenizer_class, + image_processor_class=image_processor_class, + feature_extractor_class=feature_extractor_class, + has_preprocessor_config=has_preprocessor_config, + ) + + +def _is_tokenizer_class_name(name: str) -> bool: + """Heuristic: does this transformers class name look like a tokenizer? + + Tokenizer classes follow the ``*Tokenizer`` / ``*TokenizerFast`` naming + convention (e.g. ``RobertaTokenizer``, ``BertTokenizerFast``). Used to + detect when ``AutoProcessor.from_pretrained`` returned a leaf tokenizer + rather than a multimodal ``ProcessorMixin`` wrapper. + """ + return name.endswith(("Tokenizer", "TokenizerFast")) + + +def _is_image_processor_class_name(name: str) -> bool: + """Heuristic: does this transformers class name look like an image processor?""" + return name.endswith(("ImageProcessor", "ImageProcessorFast")) + + +def _is_feature_extractor_class_name(name: str) -> bool: + """Heuristic: does this transformers class name look like a feature extractor?""" + return name.endswith("FeatureExtractor") def _resolve_processor_from_auto_classes( @@ -1058,28 +1114,31 @@ def _resolve_processor_from_auto_classes( processor = AutoProcessor.from_pretrained(model_id, use_fast=True) processor_class = type(processor).__name__ - # AutoProcessor may wrap tokenizer and image_processor - if ( - try_tokenizer - and hasattr(processor, "tokenizer") - and processor.tokenizer is not None - ): - tokenizer_class = type(processor.tokenizer).__name__ - - if ( - try_image_processor - and hasattr(processor, "image_processor") - and processor.image_processor is not None - ): - image_processor_class = type(processor.image_processor).__name__ - - # Some older models use feature_extractor instead of image_processor - if ( - try_feature_extractor - and hasattr(processor, "feature_extractor") - and processor.feature_extractor is not None - ): - feature_extractor_class = type(processor.feature_extractor).__name__ + # AutoProcessor may wrap tokenizer / image_processor / feature_extractor + # as a multimodal `ProcessorMixin`. For single-modality models it + # often returns the leaf class directly (e.g. RoBERTa → + # `RobertaTokenizerFast`), which has none of those attributes. + # Pattern-match the returned class name so the standalone Auto* + # calls below can be skipped — otherwise we pay for a second, + # redundant load (~2s for AutoTokenizer on warm cache). + wrapped_tokenizer = getattr(processor, "tokenizer", None) + wrapped_image_processor = getattr(processor, "image_processor", None) + wrapped_feature_extractor = getattr(processor, "feature_extractor", None) + + if try_tokenizer and wrapped_tokenizer is not None: + tokenizer_class = type(wrapped_tokenizer).__name__ + elif try_tokenizer and _is_tokenizer_class_name(processor_class): + tokenizer_class = processor_class + + if try_image_processor and wrapped_image_processor is not None: + image_processor_class = type(wrapped_image_processor).__name__ + elif try_image_processor and _is_image_processor_class_name(processor_class): + image_processor_class = processor_class + + if try_feature_extractor and wrapped_feature_extractor is not None: + feature_extractor_class = type(wrapped_feature_extractor).__name__ + elif try_feature_extractor and _is_feature_extractor_class_name(processor_class): + feature_extractor_class = processor_class except Exception as e: logger.debug("AutoProcessor failed for %s: %s", model_id, e) diff --git a/tests/unit/inspect/test_resolve_processor_gating.py b/tests/unit/inspect/test_resolve_processor_gating.py index 02e115c31..5a27dcd0f 100644 --- a/tests/unit/inspect/test_resolve_processor_gating.py +++ b/tests/unit/inspect/test_resolve_processor_gating.py @@ -20,14 +20,21 @@ from unittest.mock import MagicMock, patch from winml.modelkit.inspect.resolver import ( + _HubConfigResult, _resolve_processor_from_auto_classes, resolve_processor, ) -def _all_filled_hub_result() -> tuple[str, str, str, str]: - """Strategy 1 returns all four processor types.""" - return ("BertProcessor", "BertTokenizer", "BertImageProcessor", "BertFeatureExtractor") +def _all_filled_hub_result() -> _HubConfigResult: + """Strategy 1 returns all four processor types + preprocessor_config present.""" + return _HubConfigResult( + processor_class="BertProcessor", + tokenizer_class="BertTokenizer", + image_processor_class="BertImageProcessor", + feature_extractor_class="BertFeatureExtractor", + has_preprocessor_config=True, + ) class TestResolveProcessorStrategy2Gating: @@ -56,7 +63,13 @@ def test_strategy2_skipped_when_all_fields_filled_by_strategy1(self) -> None: def test_strategy2_called_with_per_field_flags(self) -> None: """Only the fields still missing after Strategy 1 should have try_*=True.""" # Strategy 1 fills only image_processor and feature_extractor. - hub_result = (None, None, "ConvNextImageProcessor", "ConvNextFeatureExtractor") + hub_result = _HubConfigResult( + processor_class=None, + tokenizer_class=None, + image_processor_class="ConvNextImageProcessor", + feature_extractor_class="ConvNextFeatureExtractor", + has_preprocessor_config=True, + ) with ( patch( @@ -79,10 +92,17 @@ def test_strategy2_called_with_per_field_flags(self) -> None: def test_strategy2_runs_when_nothing_filled(self) -> None: """Empty Strategy-1 result → Strategy 2 runs with every flag True.""" + empty_hub_result = _HubConfigResult( + processor_class=None, + tokenizer_class=None, + image_processor_class=None, + feature_extractor_class=None, + has_preprocessor_config=True, + ) with ( patch( "winml.modelkit.inspect.resolver._resolve_processor_from_hub_configs", - return_value=(None, None, None, None), + return_value=empty_hub_result, ), # Block Strategy 0 (HF registry) by passing no model_type below patch( @@ -101,6 +121,46 @@ def test_strategy2_runs_when_nothing_filled(self) -> None: assert info.processor_class == "P" assert info.feature_extractor_class == "F" + def test_missing_preprocessor_config_skips_image_and_feature(self) -> None: + """preprocessor_config.json absent → skip AutoImageProcessor & AutoFeatureExtractor. + + Text-only models (RoBERTa, BERT, GPT, ...) don't ship a + preprocessor_config.json. Without this gate, Strategy 2 spends + ~2s confirming 404s for both AutoImageProcessor and + AutoFeatureExtractor. The hub_configs helper now reports the + file's existence so the caller can skip those lookups. + """ + hub_result = _HubConfigResult( + processor_class=None, + tokenizer_class=None, + image_processor_class=None, + feature_extractor_class=None, + has_preprocessor_config=False, + ) + + with ( + patch( + "winml.modelkit.inspect.resolver._resolve_processor_from_hub_configs", + return_value=hub_result, + ), + patch( + "winml.modelkit.inspect.resolver._resolve_processor_from_auto_classes", + return_value=(None, None, None, None), + ) as mock_auto, + ): + resolve_processor("text/model") + + assert mock_auto.call_count == 1 + kwargs = mock_auto.call_args.kwargs + assert kwargs["try_processor"] is True + assert kwargs["try_tokenizer"] is True + assert kwargs["try_image_processor"] is False, ( + "Must skip AutoImageProcessor when preprocessor_config.json is absent" + ) + assert kwargs["try_feature_extractor"] is False, ( + "Must skip AutoFeatureExtractor when preprocessor_config.json is absent" + ) + class TestAutoProcessorGatedOnTryProcessor: """When ``try_processor=False`` we skip AutoProcessor entirely. @@ -164,3 +224,163 @@ def test_try_processor_true_still_calls_autoprocessor(self) -> None: ) assert mock_ap.call_count == 1 + + +class TestAutoProcessorLeafClassDetection: + """``AutoProcessor.from_pretrained`` may return a leaf processor. + + For text-only models (RoBERTa, BERT, ...) ``AutoProcessor`` returns + the tokenizer directly — e.g. ``RobertaTokenizerFast``. Without + recognising this we would re-load the tokenizer via the standalone + ``AutoTokenizer.from_pretrained`` below at ~2s of extra cost. + """ + + @staticmethod + def _make_leaf_instance(class_name: str) -> object: + """Build an instance whose ``type(obj).__name__`` is ``class_name``. + + Plain instance — no ``.tokenizer`` / ``.image_processor`` / + ``.feature_extractor`` attributes — so the leaf-class detection + branch is what matches. + """ + return type(class_name, (), {})() + + def test_autoprocessor_returns_tokenizer_fills_tokenizer_class(self) -> None: + """When AutoProcessor returns a *Tokenizer*, tokenizer_class is populated + and standalone AutoTokenizer is NOT called. + """ + fake = self._make_leaf_instance("RobertaTokenizerFast") + + with ( + patch("transformers.AutoProcessor.from_pretrained", return_value=fake), + patch("transformers.AutoTokenizer.from_pretrained") as mock_at, + patch("transformers.AutoImageProcessor.from_pretrained"), + patch("transformers.AutoFeatureExtractor.from_pretrained"), + ): + proc, tok, _img, _feat = _resolve_processor_from_auto_classes( + "some/text-model", + try_processor=True, + try_tokenizer=True, + try_image_processor=False, + try_feature_extractor=False, + ) + + assert proc == "RobertaTokenizerFast" + assert tok == "RobertaTokenizerFast" + assert mock_at.call_count == 0, ( + "Standalone AutoTokenizer must be skipped when AutoProcessor " + "already returned a *Tokenizer* leaf class" + ) + + def test_autoprocessor_returns_image_processor_fills_image_class(self) -> None: + """AutoProcessor returning a *ImageProcessor* fills image_processor_class.""" + fake = self._make_leaf_instance("ConvNextImageProcessor") + + with ( + patch("transformers.AutoProcessor.from_pretrained", return_value=fake), + patch("transformers.AutoImageProcessor.from_pretrained") as mock_aip, + ): + proc, _, img, _ = _resolve_processor_from_auto_classes( + "some/vision-model", + try_processor=True, + try_tokenizer=False, + try_image_processor=True, + try_feature_extractor=False, + ) + + assert proc == "ConvNextImageProcessor" + assert img == "ConvNextImageProcessor" + assert mock_aip.call_count == 0 + + def test_autoprocessor_returns_feature_extractor_fills_feature_class(self) -> None: + """AutoProcessor returning a *FeatureExtractor* fills feature_extractor_class.""" + fake = self._make_leaf_instance("Wav2Vec2FeatureExtractor") + + with ( + patch("transformers.AutoProcessor.from_pretrained", return_value=fake), + patch("transformers.AutoFeatureExtractor.from_pretrained") as mock_afe, + ): + proc, _, _, feat = _resolve_processor_from_auto_classes( + "some/audio-model", + try_processor=True, + try_tokenizer=False, + try_image_processor=False, + try_feature_extractor=True, + ) + + assert proc == "Wav2Vec2FeatureExtractor" + assert feat == "Wav2Vec2FeatureExtractor" + assert mock_afe.call_count == 0 + + def test_autoprocessor_with_wrapped_pieces_uses_attributes(self) -> None: + """Multimodal AutoProcessor (real ProcessorMixin) wins over name suffix.""" + + class CLIPTokenizer: + pass + + class CLIPProcessor: + def __init__(self) -> None: + self.tokenizer = CLIPTokenizer() + + with ( + patch( + "transformers.AutoProcessor.from_pretrained", + return_value=CLIPProcessor(), + ), + patch("transformers.AutoTokenizer.from_pretrained") as mock_at, + ): + proc, tok, _, _ = _resolve_processor_from_auto_classes( + "openai/clip-vit-base-patch32", + try_processor=True, + try_tokenizer=True, + try_image_processor=False, + try_feature_extractor=False, + ) + + assert proc == "CLIPProcessor" + assert tok == "CLIPTokenizer" + assert mock_at.call_count == 0 + + def test_unrecognized_leaf_falls_through_to_standalone_autos(self) -> None: + """Leaf class with an unrecognized suffix → standalone Auto* fills the gaps. + + ``SpeechT5Processor`` ends in ``Processor`` but none of the + ``_is_tokenizer_class_name`` / ``_is_image_processor_class_name`` / + ``_is_feature_extractor_class_name`` heuristics match it. The + leaf-class shortcut leaves ``tokenizer_class`` / ``image_processor_class`` + / ``feature_extractor_class`` as ``None``, and the standalone + ``Auto*`` calls below fill them in — documenting the graceful + fallback when the suffix heuristic does not match. + """ + fake = self._make_leaf_instance("SpeechT5Processor") + + fake_tok = type("SomeTokenizer", (), {})() + fake_feat = type("SomeFeatureExtractor", (), {})() + + with ( + patch("transformers.AutoProcessor.from_pretrained", return_value=fake), + patch( + "transformers.AutoTokenizer.from_pretrained", + return_value=fake_tok, + ) as mock_at, + patch("transformers.AutoImageProcessor.from_pretrained") as mock_aip, + patch( + "transformers.AutoFeatureExtractor.from_pretrained", + return_value=fake_feat, + ) as mock_afe, + ): + proc, tok, _img, feat = _resolve_processor_from_auto_classes( + "microsoft/speecht5_tts", + try_processor=True, + try_tokenizer=True, + try_image_processor=False, + try_feature_extractor=True, + ) + + assert proc == "SpeechT5Processor" + # Suffix didn't match any leaf-class heuristic → standalone Auto* must run. + assert tok == "SomeTokenizer" + assert feat == "SomeFeatureExtractor" + assert mock_at.call_count == 1 + assert mock_aip.call_count == 0 # gated off by try_image_processor=False + assert mock_afe.call_count == 1