diff --git a/scripts/e2e_eval/cache/baseline_cache.json b/scripts/e2e_eval/cache/baseline_cache.json index 013d8acdc..033a5b376 100644 --- a/scripts/e2e_eval/cache/baseline_cache.json +++ b/scripts/e2e_eval/cache/baseline_cache.json @@ -579,6 +579,86 @@ "elapsed": 449.7, "command": "python.exe run_pytorch_baseline.py --model swin-large-patch4-window7-224 --task image-classification --device cpu --num-samples 1000 --dataset mini-imagenet --split test" }, + "facebook/dinov2-small|image-feature-extraction|timm/mini-imagenet||test|1000": { + "status": "PASS", + "metric": { + "metric": "knn_top1_accuracy", + "value": 86.2, + "num_samples": 1000 + }, + "elapsed": 141.5, + "command": "python.exe run_pytorch_baseline.py --model dinov2-small --task image-feature-extraction --device cpu --num-samples 1000 --dataset mini-imagenet --split test" + }, + "facebook/dinov2-base|image-feature-extraction|timm/mini-imagenet||test|1000": { + "status": "PASS", + "metric": { + "metric": "knn_top1_accuracy", + "value": 89.5, + "num_samples": 1000 + }, + "elapsed": 316.5, + "command": "python.exe run_pytorch_baseline.py --model dinov2-base --task image-feature-extraction --device cpu --num-samples 1000 --dataset mini-imagenet --split test" + }, + "facebook/dino-vits16|image-feature-extraction|timm/mini-imagenet||test|1000": { + "status": "PASS", + "metric": { + "metric": "knn_top1_accuracy", + "value": 79.7, + "num_samples": 1000 + }, + "elapsed": 94.2, + "command": "python.exe run_pytorch_baseline.py --model dino-vits16 --task image-feature-extraction --device cpu --num-samples 1000 --dataset mini-imagenet --split test" + }, + "facebook/dino-vitb16|image-feature-extraction|timm/mini-imagenet||test|1000": { + "status": "PASS", + "metric": { + "metric": "knn_top1_accuracy", + "value": 83.2, + "num_samples": 1000 + }, + "elapsed": 248.6, + "command": "python.exe run_pytorch_baseline.py --model dino-vitb16 --task image-feature-extraction --device cpu --num-samples 1000 --dataset mini-imagenet 
--split test" + }, + "google/vit-base-patch16-224-in21k|image-feature-extraction|timm/mini-imagenet||test|1000": { + "status": "PASS", + "metric": { + "metric": "knn_top1_accuracy", + "value": 91.9, + "num_samples": 1000 + }, + "elapsed": 231.2, + "command": "python.exe run_pytorch_baseline.py --model vit-base-patch16-224-in21k --task image-feature-extraction --device cpu --num-samples 1000 --dataset mini-imagenet --split test" + }, + "StanfordAIMI/dinov2-base-xray-224|image-feature-extraction|Ewakaa/pneumonia_classification_chest_xray||test|582": { + "status": "PASS", + "metric": { + "metric": "knn_top1_accuracy", + "value": 93.6426, + "num_samples": 582 + }, + "elapsed": 166.0, + "command": "python.exe run_pytorch_baseline.py --model dinov2-base-xray-224 --task image-feature-extraction --device cpu --num-samples 582 --dataset pneumonia_classification_chest_xray --split test" + }, + "facebook/dinov2-large|image-feature-extraction|timm/mini-imagenet||test|1000": { + "status": "PASS", + "metric": { + "metric": "knn_top1_accuracy", + "value": 91.1, + "num_samples": 1000 + }, + "elapsed": 890.6, + "command": "python.exe run_pytorch_baseline.py --model dinov2-large --task image-feature-extraction --device cpu --num-samples 1000 --dataset mini-imagenet --split test" + }, + "microsoft/rad-dino|image-feature-extraction|Ewakaa/pneumonia_classification_chest_xray||test|582": { + "status": "PASS", + "metric": { + "metric": "knn_top1_accuracy", + "value": 94.6735, + "num_samples": 582 + }, + "elapsed": 894.7, + "command": "python.exe run_pytorch_baseline.py --model rad-dino --task image-feature-extraction --device cpu --num-samples 582 --dataset pneumonia_classification_chest_xray --split test" + }, "google-bert/bert-base-uncased|fill-mask|Salesforce/wikitext|wikitext-2-raw-v1|test|100": { "status": "PASS", "metric": { diff --git a/scripts/e2e_eval/run_pytorch_baseline.py b/scripts/e2e_eval/run_pytorch_baseline.py index d9c3bda4f..8b74bd023 100644 --- 
a/scripts/e2e_eval/run_pytorch_baseline.py +++ b/scripts/e2e_eval/run_pytorch_baseline.py @@ -63,6 +63,7 @@ def _emit_result(metric: str, value: float, num_samples: int) -> None: "image-segmentation": "mean_iou", "feature-extraction": "cosine_spearman", "sentence-similarity": "cosine_spearman", + "image-feature-extraction": "knn_top1_accuracy", "fill-mask": "pseudo_perplexity", } @@ -75,8 +76,8 @@ def _emit_result(metric: str, value: float, num_samples: int) -> None: def _load_pytorch_model(model_id: str, task: str, device_str: str): """Load a native PyTorch model with the task-appropriate AutoModel class.""" import torch - from transformers import AutoConfig + from transformers import AutoConfig from winml.modelkit.loader.task import resolve_task_and_model_class config = AutoConfig.from_pretrained(model_id) diff --git a/scripts/e2e_eval/testsets/models_with_acc.json b/scripts/e2e_eval/testsets/models_with_acc.json index 44aabe069..01ae3614f 100644 --- a/scripts/e2e_eval/testsets/models_with_acc.json +++ b/scripts/e2e_eval/testsets/models_with_acc.json @@ -1015,6 +1015,118 @@ } } }, + { + "hf_id": "facebook/dinov2-small", + "task": "image-feature-extraction", + "model_type": "dinov2", + "group": "Top200", + "priority": "P1", + "dataset_config": { + "path": "timm/mini-imagenet", + "split": "test", + "samples": 1000, + "metric": "knn_top1_accuracy", + "winml_metric_key": "knn_top1_accuracy" + } + }, + { + "hf_id": "facebook/dinov2-base", + "task": "image-feature-extraction", + "model_type": "dinov2", + "group": "Top200", + "priority": "P1", + "dataset_config": { + "path": "timm/mini-imagenet", + "split": "test", + "samples": 1000, + "metric": "knn_top1_accuracy", + "winml_metric_key": "knn_top1_accuracy" + } + }, + { + "hf_id": "facebook/dinov2-large", + "task": "image-feature-extraction", + "model_type": "dinov2", + "group": "Top200", + "priority": "P1", + "dataset_config": { + "path": "timm/mini-imagenet", + "split": "test", + "samples": 1000, + "metric": 
"knn_top1_accuracy", + "winml_metric_key": "knn_top1_accuracy" + } + }, + { + "hf_id": "facebook/dino-vits16", + "task": "image-feature-extraction", + "model_type": "vit", + "group": "Top200", + "priority": "P1", + "dataset_config": { + "path": "timm/mini-imagenet", + "split": "test", + "samples": 1000, + "metric": "knn_top1_accuracy", + "winml_metric_key": "knn_top1_accuracy" + } + }, + { + "hf_id": "facebook/dino-vitb16", + "task": "image-feature-extraction", + "model_type": "vit", + "group": "Top200", + "priority": "P1", + "dataset_config": { + "path": "timm/mini-imagenet", + "split": "test", + "samples": 1000, + "metric": "knn_top1_accuracy", + "winml_metric_key": "knn_top1_accuracy" + } + }, + { + "hf_id": "google/vit-base-patch16-224-in21k", + "task": "image-feature-extraction", + "model_type": "vit", + "group": "Top200", + "priority": "P1", + "dataset_config": { + "path": "timm/mini-imagenet", + "split": "test", + "samples": 1000, + "metric": "knn_top1_accuracy", + "winml_metric_key": "knn_top1_accuracy" + } + }, + { + "hf_id": "microsoft/rad-dino", + "task": "image-feature-extraction", + "model_type": "dinov2", + "group": "Top200", + "priority": "P1", + "dataset_config": { + "path": "Ewakaa/pneumonia_classification_chest_xray", + "split": "test", + "samples": 582, + "metric": "knn_top1_accuracy", + "winml_metric_key": "knn_top1_accuracy" + } + }, + { + "hf_id": "StanfordAIMI/dinov2-base-xray-224", + "task": "image-feature-extraction", + "model_type": "dinov2", + "group": "Top200", + "priority": "P1", + "dataset_config": { + "path": "Ewakaa/pneumonia_classification_chest_xray", + "split": "test", + "samples": 582, + "metric": "knn_top1_accuracy", + "winml_metric_key": "knn_top1_accuracy" + } + }, { "hf_id": "google-bert/bert-base-uncased", "task": "fill-mask", diff --git a/scripts/e2e_eval/utils/accuracy.py b/scripts/e2e_eval/utils/accuracy.py index a07882e35..e1996a03d 100644 --- a/scripts/e2e_eval/utils/accuracy.py +++ b/scripts/e2e_eval/utils/accuracy.py 
@@ -44,6 +44,8 @@ class AccuracyVerdict(str, Enum): # False = smaller value is better (WER, loss) METRIC_COMPARE_STRATEGY: dict[str, tuple[str, float, float, bool]] = { "cosine_spearman": ("delta_absolute", 2.0, 4.0, True), + # WinML-vs-baseline delta is small — pick a tighter threshold than default. + "knn_top1_accuracy": ("delta_relative", 0.02, 0.05, True), "pseudo_perplexity": ("delta_relative", 0.05, 0.10, False), "default": ("delta_relative", 0.05, 0.10, True), # 5% and 10% } diff --git a/src/winml/modelkit/eval/__init__.py b/src/winml/modelkit/eval/__init__.py index 2b8536c2f..7066eb126 100644 --- a/src/winml/modelkit/eval/__init__.py +++ b/src/winml/modelkit/eval/__init__.py @@ -13,7 +13,9 @@ from .evaluate import EvalResult, evaluate from .feature_extraction_evaluator import WinMLFeatureExtractionEvaluator from .fill_mask_evaluator import WinMLFillMaskEvaluator +from .image_feature_extraction_evaluator import WinMLImageFeatureExtractionEvaluator from .image_segmentation_evaluator import WinMLImageSegmentationEvaluator +from .metrics.knn_accuracy import KNNAccuracyMetric from .metrics.mean_average_precision import MAPMetric from .metrics.mean_iou import IGNORE_INDEX, MeanIoUMetric from .metrics.pseudo_perplexity import PseudoPerplexityMetric @@ -27,6 +29,7 @@ __all__ = [ "IGNORE_INDEX", "EvalResult", + "KNNAccuracyMetric", "MAPMetric", "MeanIoUMetric", "PseudoPerplexityMetric", @@ -35,6 +38,7 @@ "WinMLEvaluator", "WinMLFeatureExtractionEvaluator", "WinMLFillMaskEvaluator", + "WinMLImageFeatureExtractionEvaluator", "WinMLImageSegmentationEvaluator", "WinMLObjectDetectionEvaluator", "WinMLQuestionAnsweringEvaluator", diff --git a/src/winml/modelkit/eval/evaluate.py b/src/winml/modelkit/eval/evaluate.py index deba9e1d1..f068d174e 100644 --- a/src/winml/modelkit/eval/evaluate.py +++ b/src/winml/modelkit/eval/evaluate.py @@ -18,6 +18,7 @@ from .config import WinMLEvaluationConfig from .feature_extraction_evaluator import WinMLFeatureExtractionEvaluator from 
.fill_mask_evaluator import WinMLFillMaskEvaluator +from .image_feature_extraction_evaluator import WinMLImageFeatureExtractionEvaluator from .image_segmentation_evaluator import WinMLImageSegmentationEvaluator from .object_detection_evaluator import WinMLObjectDetectionEvaluator from .question_answering_evaluator import WinMLQuestionAnsweringEvaluator @@ -40,6 +41,7 @@ "question-answering": WinMLQuestionAnsweringEvaluator, "feature-extraction": WinMLFeatureExtractionEvaluator, "sentence-similarity": WinMLFeatureExtractionEvaluator, + "image-feature-extraction": WinMLImageFeatureExtractionEvaluator, "fill-mask": WinMLFillMaskEvaluator, } @@ -109,6 +111,12 @@ ), "feature-extraction": _FE_DEFAULT, "sentence-similarity": _FE_DEFAULT, + "image-feature-extraction": DatasetConfig( + path="timm/mini-imagenet", + split="test", + samples=1000, + shuffle=True, + ), "fill-mask": DatasetConfig( path="Salesforce/wikitext", name="wikitext-2-raw-v1", diff --git a/src/winml/modelkit/eval/image_feature_extraction_evaluator.py b/src/winml/modelkit/eval/image_feature_extraction_evaluator.py new file mode 100644 index 000000000..0832f87a8 --- /dev/null +++ b/src/winml/modelkit/eval/image_feature_extraction_evaluator.py @@ -0,0 +1,130 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- + +"""Image feature extraction evaluator using kNN classification accuracy. + +Evaluates image embedding models (e.g. DINOv2, DINO, ViT-in21k) by: + 1. Extracting the CLS token embedding for each image via the pipeline. + 2. Running a leave-one-out k-Nearest Neighbor classifier. + 3. Reporting kNN top-1 and top-5 accuracy. 
class WinMLImageFeatureExtractionEvaluator(WinMLEvaluator):
    """Evaluator for image feature extraction using kNN classification accuracy.

    Every sample's image is passed through the pipeline, reduced to a single
    vector by :meth:`_extract_image_embedding`, and the collected embeddings
    are scored with ``KNNAccuracyMetric``.

    Column names are read from ``config.dataset.columns_mapping`` using the
    roles declared in :meth:`schema_info` (``input_column`` for the image,
    ``label_column`` for the label) and default to ``"image"`` / ``"label"``.
    """

    # Neighbour count handed to KNNAccuracyMetric in ``compute``.
    _KNN_K = 10

    @classmethod
    def schema_info(cls) -> list:
        """Return expected dataset schema for image feature extraction evaluation."""
        from .config import SchemaColumn

        return [
            SchemaColumn("image", "Image", "input_column", description="PIL Image"),
            SchemaColumn(
                "label", "ClassLabel", "label_column",
                description="integer class label",
            ),
        ]

    def __init__(
        self,
        config: WinMLEvaluationConfig,
        model: WinMLPreTrainedModel,
    ) -> None:
        """Resolve dataset column names, then delegate to the base evaluator.

        Args:
            config: Evaluation config; ``config.dataset.columns_mapping`` may
                map the roles ``input_column`` / ``label_column`` to the
                dataset's actual column names.
            model: Model under evaluation.
        """
        # Tolerate a missing/None mapping instead of failing on ``.get``.
        mapping = config.dataset.columns_mapping or {}
        # Honour the image-column mapping the same way as the label column, so
        # datasets whose image column is not literally "image" are not
        # silently skipped sample by sample in ``compute``.
        self._image_col = mapping.get("input_column", "image")
        self._label_col = mapping.get("label_column", "label")
        # Column names are assigned before the base initialiser runs, as in the
        # original ordering (presumably the base class may use them during
        # setup -- TODO confirm).
        super().__init__(config, model)

    @staticmethod
    def _is_static_dim(dim: Any) -> bool:
        """Return True if *dim* is a concrete positive integer dimension."""
        return isinstance(dim, int) and not isinstance(dim, bool) and dim > 0

    def prepare_pipeline(self) -> Pipeline:
        """Create pipeline and match image processor size to ONNX input shape.

        The processor size is only overridden when the model's ``io_config``
        lists a 4-element first input shape whose last two entries are concrete
        positive integers. Symbolic, ``None`` or non-positive entries (dynamic
        axes) leave the processor's own default size untouched.
        """
        pipe = super().prepare_pipeline()

        io_config = getattr(self.model, "io_config", None) or {}
        input_shapes = io_config.get("input_shapes") or []
        if input_shapes and len(input_shapes[0]) == 4:
            # assumes NCHW layout for the first model input -- TODO confirm
            _, _, height, width = input_shapes[0]
            if self._is_static_dim(height) and self._is_static_dim(width):
                # NOTE(review): processors that also centre-crop keep their own
                # ``crop_size``; confirm it agrees with the ONNX input size.
                pipe.image_processor.size = {"height": height, "width": width}

        return pipe

    def align_labels(self, dataset: Dataset, ds_config: DatasetConfig) -> Dataset:
        """No-op: kNN uses dataset labels directly, no model-side label mapping."""
        return dataset

    def compute(self) -> dict[str, Any]:
        """Run kNN evaluation and return accuracy metrics.

        Returns:
            The dict produced by ``KNNAccuracyMetric.compute``
            (``knn_top1_accuracy`` and ``knn_top5_accuracy``).

        Raises:
            ValueError: If fewer than 2 samples have both an image and a label.
        """
        from .metrics.knn_accuracy import KNNAccuracyMetric

        embeddings: list[np.ndarray] = []
        labels: list[int] = []
        skipped = 0

        for sample in tqdm(self.data, desc="Embedding images", unit="img"):
            image = sample.get(self._image_col)
            label = sample.get(self._label_col)

            # Samples missing either field cannot take part in kNN voting.
            if image is None or label is None:
                skipped += 1
                continue

            raw = self.pipe(image)
            embeddings.append(self._extract_image_embedding(raw))
            labels.append(int(label))

        if len(embeddings) < 2:
            raise ValueError(
                f"Need at least 2 valid samples for kNN evaluation, got {len(embeddings)}."
                f" Skipped {skipped} sample(s) without a value in column"
                f" {self._image_col!r} or {self._label_col!r}."
            )

        embeddings_array = np.array(embeddings)
        labels_array = np.array(labels)

        metric = KNNAccuracyMetric(k=self._KNN_K)
        return metric.compute(embeddings_array, labels_array)

    @staticmethod
    def _extract_image_embedding(raw: Any) -> np.ndarray:
        """Reduce a pipeline output to a single 1D image-level embedding vector.

        Only the first element of *raw* is inspected:

        - ``raw[0]`` is 2D (``[num_tokens, hidden]``): the row at index 0 is
          returned, i.e. the CLS token for ViT / DINOv2-style encoders.
        - ``raw[0]`` is 1D (``[hidden]``, e.g. a pooled output): returned as-is.

        Raises:
            ValueError: For any other rank (scalars, CNN feature maps, ...).
        """
        tokens = np.asarray(raw[0])
        if tokens.ndim == 1:
            return tokens
        if tokens.ndim == 2:
            # CLS token (index 0) — standard image-level embedding for ViT/DINOv2.
            return tokens[0]
        raise ValueError(
            f"Unsupported image-feature-extraction output shape: {np.asarray(raw).shape}. "
            "Expected [1, hidden] (pooled) or [1, num_tokens, hidden] (token sequence)."
        )
+ ) diff --git a/src/winml/modelkit/eval/metrics/__init__.py b/src/winml/modelkit/eval/metrics/__init__.py index 6964891ce..59500ae35 100644 --- a/src/winml/modelkit/eval/metrics/__init__.py +++ b/src/winml/modelkit/eval/metrics/__init__.py @@ -5,6 +5,7 @@ """Evaluation metrics.""" +from .knn_accuracy import KNNAccuracyMetric from .mean_average_precision import MAPMetric from .mean_iou import IGNORE_INDEX, MeanIoUMetric from .pseudo_perplexity import PseudoPerplexityMetric @@ -13,6 +14,7 @@ __all__ = [ "IGNORE_INDEX", + "KNNAccuracyMetric", "MAPMetric", "MeanIoUMetric", "PseudoPerplexityMetric", diff --git a/src/winml/modelkit/eval/metrics/knn_accuracy.py b/src/winml/modelkit/eval/metrics/knn_accuracy.py new file mode 100644 index 000000000..7cffee56d --- /dev/null +++ b/src/winml/modelkit/eval/metrics/knn_accuracy.py @@ -0,0 +1,154 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- + +"""k-Nearest Neighbor accuracy metric for image feature extraction. + +Evaluates embedding quality by using a leave-one-out kNN classifier: + 1. For each sample, find its k nearest neighbors by cosine similarity. + 2. Predict label via distance-weighted majority vote among neighbors. + 3. Report top-1 and top-5 kNN classification accuracy. +""" + +from __future__ import annotations + +from typing import Any + +import numpy as np + + +_DEFAULT_K = 10 + + +class KNNAccuracyMetric: + """k-Nearest Neighbor classification accuracy on embeddings. 
+ + Typical usage:: + + metric = KNNAccuracyMetric(k=10) + result = metric.compute(embeddings, labels) + # {"knn_top1_accuracy": 72.5, "knn_top5_accuracy": 91.3} + """ + + def __init__(self, k: int = _DEFAULT_K) -> None: + self.k = k + + def compute( + self, + embeddings: np.ndarray, + labels: np.ndarray, + ) -> dict[str, Any]: + """Compute kNN accuracy. + + Args: + embeddings: (N, D) float array of L2-normalized embeddings. + labels: (N,) int array of ground-truth class labels. + + Returns: + Dict with ``knn_top1_accuracy`` and ``knn_top5_accuracy`` + as percentages in [0, 100]. + + Raises: + ValueError: If fewer than 2 samples or k < 1. + """ + n = len(embeddings) + if n < 2: + raise ValueError(f"At least 2 samples required for kNN, got {n}.") + if self.k < 1: + raise ValueError(f"k must be >= 1, got {self.k}.") + + k = min(self.k, n - 1) + top1_predictions, top5_predictions = self._predict_labels( + embeddings, labels, k, + ) + return self._compute_accuracy(top1_predictions, top5_predictions, labels) + + def _predict_labels( + self, + embeddings: np.ndarray, + labels: np.ndarray, + k: int, + ) -> tuple[np.ndarray, list[list[int]]]: + """Predict labels via kNN weighted voting. + + Args: + embeddings: (N, D) float array. + labels: (N,) int array of ground-truth class labels (used as + neighbor labels for voting, not for accuracy). + k: Number of neighbors to use. + + Returns: + Tuple of (top1_predictions, top5_predictions): + - top1_predictions: (N,) int array of predicted labels. + - top5_predictions: list of N lists, each containing up to 5 + class labels ranked by vote weight (descending). 
+ """ + # L2-normalize (no-op if already normalized, safe either way) + norms = np.linalg.norm(embeddings, axis=1, keepdims=True) + norms = np.maximum(norms, 1e-9) + embeddings = embeddings / norms + + # Cosine similarity matrix: (N, N) + similarity = embeddings @ embeddings.T + + # Exclude self-similarity + np.fill_diagonal(similarity, -np.inf) + + # Top-k neighbor indices per sample + # argpartition is O(N) per row vs O(N log N) for full sort + top_k_indices = np.argpartition(similarity, -k, axis=1)[:, -k:] + + n = len(embeddings) + top1_predictions = np.empty(n, dtype=np.int64) + top5_predictions: list[list[int]] = [] + + for i in range(n): + neighbor_idx = top_k_indices[i] + neighbor_sims = similarity[i, neighbor_idx] + neighbor_labels = labels[neighbor_idx] + + # Sort neighbors by similarity (descending) + sorted_order = np.argsort(-neighbor_sims) + sorted_labels = neighbor_labels[sorted_order] + sorted_sims = neighbor_sims[sorted_order] + + # Weighted vote + vote_weights: dict[int, float] = {} + for label, sim in zip(sorted_labels, sorted_sims, strict=True): + label_int = int(label) + vote_weights[label_int] = vote_weights.get(label_int, 0.0) + float(sim) + + ranked = sorted(vote_weights, key=lambda c: vote_weights[c], reverse=True) + top1_predictions[i] = ranked[0] + top5_predictions.append(ranked[:5]) + + return top1_predictions, top5_predictions + + @staticmethod + def _compute_accuracy( + top1_predictions: np.ndarray, + top5_predictions: list[list[int]], + labels: np.ndarray, + ) -> dict[str, Any]: + """Compute top-1 and top-5 accuracy from predictions. + + Args: + top1_predictions: (N,) int array of predicted labels. + top5_predictions: list of N lists of up to 5 candidate labels. + labels: (N,) int array of ground-truth labels. + + Returns: + Dict with ``knn_top1_accuracy`` and ``knn_top5_accuracy`` + as percentages in [0, 100]. 
+ """ + n = len(labels) + top1_correct = int(np.sum(top1_predictions == labels)) + top5_correct = sum( + int(labels[i]) in top5_predictions[i] for i in range(n) + ) + + top1_acc = round(top1_correct / n * 100, 4) + top5_acc = round(top5_correct / n * 100, 4) + + return {"knn_top1_accuracy": top1_acc, "knn_top5_accuracy": top5_acc} diff --git a/src/winml/modelkit/models/winml/__init__.py b/src/winml/modelkit/models/winml/__init__.py index f82959b09..ea6069a17 100644 --- a/src/winml/modelkit/models/winml/__init__.py +++ b/src/winml/modelkit/models/winml/__init__.py @@ -48,6 +48,7 @@ "fill-mask": "WinMLModelForMaskedLM", "feature-extraction": "WinMLModelForFeatureExtraction", "sentence-similarity": "WinMLModelForFeatureExtraction", + "image-feature-extraction": "WinMLModelForFeatureExtraction", } # Level 2: (model_type, task) -> Specialized class (exceptions only) diff --git a/tests/unit/eval/test_image_feature_extraction_evaluator.py b/tests/unit/eval/test_image_feature_extraction_evaluator.py new file mode 100644 index 000000000..d6333442d --- /dev/null +++ b/tests/unit/eval/test_image_feature_extraction_evaluator.py @@ -0,0 +1,295 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
# --------------------------------------------------------------------------

"""Unit tests for WinMLImageFeatureExtractionEvaluator and KNNAccuracyMetric."""

from __future__ import annotations

from unittest.mock import MagicMock, patch

import numpy as np
import pytest

from winml.modelkit.eval import KNNAccuracyMetric, WinMLImageFeatureExtractionEvaluator


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

def make_evaluator(columns_mapping=None):
    """Build an evaluator with dataset loading and pipeline creation mocked out."""
    from winml.modelkit.datasets import DatasetConfig
    from winml.modelkit.eval import WinMLEvaluationConfig

    # Dataset stand-in: shuffle/select hand back the very same mock.
    fake_dataset = MagicMock()
    fake_dataset.__len__.return_value = 10
    fake_dataset.shuffle.return_value = fake_dataset
    fake_dataset.select.return_value = fake_dataset
    fake_dataset.column_names = ["image", "label"]

    fake_pipeline = MagicMock()
    fake_pipeline.image_processor = MagicMock()

    fake_model = MagicMock()
    fake_model.config.label2id = None
    fake_model.io_config = {}

    eval_config = WinMLEvaluationConfig(
        model_id="test/model",
        task="image-feature-extraction",
        dataset=DatasetConfig(
            path="timm/mini-imagenet",
            columns_mapping=columns_mapping or {},
        ),
    )

    with patch("datasets.load_dataset", return_value=fake_dataset):
        with patch("transformers.pipeline", return_value=fake_pipeline):
            return WinMLImageFeatureExtractionEvaluator(eval_config, fake_model)


# ---------------------------------------------------------------------------
# KNNAccuracyMetric
# ---------------------------------------------------------------------------

class TestKNNAccuracyMetric:
    def test_perfect_clusters(self):
        """Two tight, nearly orthogonal clusters -> 100% accuracy."""
        # 4 samples, 2 classes: class 0 points along x, class 1 along z.
        points = np.array([
            [1.0, 0.0, 0.0],
            [0.99, 0.01, 0.0],
            [0.0, 0.0, 1.0],
            [0.01, 0.0, 0.99],
        ])
        classes = np.array([0, 0, 1, 1])

        scores = KNNAccuracyMetric(k=3).compute(points, classes)

        assert scores["knn_top1_accuracy"] == 100.0
        assert scores["knn_top5_accuracy"] == 100.0

    def test_random_embeddings_returns_valid_range(self):
        """Random embeddings should still return accuracy in [0, 100]."""
        rng = np.random.RandomState(42)
        points = rng.randn(50, 32)
        classes = rng.randint(0, 5, size=50)

        scores = KNNAccuracyMetric(k=5).compute(points, classes)

        for key in ("knn_top1_accuracy", "knn_top5_accuracy"):
            assert 0.0 <= scores[key] <= 100.0

    def test_k_capped_to_n_minus_1(self):
        """k should be capped when larger than N-1."""
        points = np.array([
            [1.0, 0.0],
            [0.9, 0.1],
            [0.0, 1.0],
        ])
        classes = np.array([0, 0, 1])

        # k=100 with 3 samples must not raise; it is capped to 2.
        scores = KNNAccuracyMetric(k=100).compute(points, classes)

        assert "knn_top1_accuracy" in scores

    def test_too_few_samples_raises(self):
        single_point = np.array([[1.0]])
        single_class = np.array([0])
        with pytest.raises(ValueError, match="At least 2 samples"):
            KNNAccuracyMetric(k=5).compute(single_point, single_class)

    def test_k_less_than_1_raises(self):
        points = np.array([[1.0], [2.0]])
        classes = np.array([0, 1])
        with pytest.raises(ValueError, match="k must be >= 1"):
            KNNAccuracyMetric(k=0).compute(points, classes)

    def test_returns_float(self):
        points = np.array([[1.0, 0.0], [0.9, 0.1], [0.0, 1.0], [0.1, 0.9]])
        classes = np.array([0, 0, 1, 1])

        scores = KNNAccuracyMetric(k=2).compute(points, classes)

        assert isinstance(scores["knn_top1_accuracy"], float)
        assert isinstance(scores["knn_top5_accuracy"], float)

    def test_two_samples_minimal(self):
        """Smallest valid case: two samples."""
        points = np.array([[1.0, 0.0], [0.0, 1.0]])
        classes = np.array([0, 1])

        scores = KNNAccuracyMetric(k=1).compute(points, classes)

        # Each sample's only neighbor carries the other label.
        assert scores["knn_top1_accuracy"] == 0.0


# ---------------------------------------------------------------------------
# WinMLImageFeatureExtractionEvaluator
# ---------------------------------------------------------------------------

class TestImageFeatureExtractionEvaluatorSchema:
    def test_schema_has_image_and_label(self):
        column_names = {
            col.name for col in WinMLImageFeatureExtractionEvaluator.schema_info()
        }
        assert "image" in column_names
        assert "label" in column_names

    def test_schema_column_types(self):
        types_by_name = {
            col.name: col.type
            for col in WinMLImageFeatureExtractionEvaluator.schema_info()
        }
        assert types_by_name["image"] == "Image"
        assert types_by_name["label"] == "ClassLabel"


class TestImageFeatureExtractionEvaluatorInit:
    def test_default_label_column(self):
        assert make_evaluator()._label_col == "label"

    def test_custom_label_column(self):
        custom = make_evaluator(columns_mapping={"label_column": "category"})
        assert custom._label_col == "category"


class TestImageFeatureExtractionEvaluatorAlignLabels:
    def test_align_labels_is_noop(self):
        dataset = MagicMock()
        ds_config = MagicMock()
        assert make_evaluator().align_labels(dataset, ds_config) is dataset


class TestExtractImageEmbedding:
    """Tests for `_extract_image_embedding` across supported output shapes."""

    def test_tokens_3d_returns_cls(self):
        # [1, num_tokens, hidden] — ViT/DINOv2 default (pool=False).
        pipeline_out = np.arange(1 * 4 * 8, dtype=np.float32).reshape(1, 4, 8)
        vector = WinMLImageFeatureExtractionEvaluator._extract_image_embedding(
            pipeline_out
        )
        assert vector.shape == (8,)
        np.testing.assert_array_equal(vector, pipeline_out[0, 0])

    def test_pooled_2d_returns_vector(self):
        # [1, hidden] — pooled / projected output.
        pipeline_out = np.arange(16, dtype=np.float32).reshape(1, 16)
        vector = WinMLImageFeatureExtractionEvaluator._extract_image_embedding(
            pipeline_out
        )
        assert vector.shape == (16,)
        np.testing.assert_array_equal(vector, pipeline_out[0])

    def test_nested_list_input_supported(self):
        # HF pipeline typically returns nested Python lists.
        pipeline_out = [[[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]]  # [1, 2, 3]
        vector = WinMLImageFeatureExtractionEvaluator._extract_image_embedding(
            pipeline_out
        )
        assert vector.shape == (3,)
        np.testing.assert_array_equal(vector, np.array([1.0, 2.0, 3.0]))

    def test_cnn_feature_map_raises(self):
        # [1, C, H, W] — not supported, surface error instead of silent bad output.
        feature_map = np.zeros((1, 8, 3, 3), dtype=np.float32)
        with pytest.raises(ValueError, match="Unsupported"):
            WinMLImageFeatureExtractionEvaluator._extract_image_embedding(feature_map)

    def test_scalar_raises(self):
        scalar_out = [np.float32(1.0)]
        with pytest.raises(ValueError, match="Unsupported"):
            WinMLImageFeatureExtractionEvaluator._extract_image_embedding(scalar_out)


class TestImageFeatureExtractionEvaluatorRegistry:
    def test_registered_in_evaluator_registry(self):
        from winml.modelkit.eval.evaluate import _EVALUATOR_REGISTRY

        assert "image-feature-extraction" in _EVALUATOR_REGISTRY
        registered = _EVALUATOR_REGISTRY["image-feature-extraction"]
        assert registered is WinMLImageFeatureExtractionEvaluator

    def test_default_dataset_registered(self):
        from winml.modelkit.eval.evaluate import _DEFAULT_DATASETS

        assert "image-feature-extraction" in _DEFAULT_DATASETS
        default_ds = _DEFAULT_DATASETS["image-feature-extraction"]
        assert default_ds.path == "timm/mini-imagenet"
        assert default_ds.samples == 1000


# ---------------------------------------------------------------------------
# WinMLImageFeatureExtractionEvaluator.compute
# ---------------------------------------------------------------------------

class TestCompute:
    """End-to-end: pipeline output -> CLS extraction -> kNN metric."""

    @staticmethod
    def _token_sequence(cls_vec: list[float], num_tokens: int = 3) -> list:
        """Build a [1, num_tokens, hidden] pipeline output with given CLS vec."""
        # Non-CLS tokens are filler — only index 0 should be used.
        filler = [[0.5] * len(cls_vec) for _ in range(num_tokens - 1)]
        return [[cls_vec] + filler]

    def test_end_to_end_flow(self):
        """Pipeline tokens -> CLS extraction -> kNN produces valid accuracies."""
        evaluator = make_evaluator()

        # Two well-separated clusters, two samples each.
        evaluator.data = [
            {"image": "img1", "label": 0},
            {"image": "img2", "label": 0},
            {"image": "img3", "label": 1},
            {"image": "img4", "label": 1},
        ]
        # One pipeline output per sample, returned in call order.
        evaluator.pipe = MagicMock(side_effect=[
            self._token_sequence([1.0, 0.0, 0.0]),
            self._token_sequence([0.99, 0.01, 0.0]),
            self._token_sequence([0.0, 1.0, 0.0]),
            self._token_sequence([0.01, 0.99, 0.0]),
        ])

        scores = evaluator.compute()

        assert "knn_top1_accuracy" in scores
        assert "knn_top5_accuracy" in scores
        # Perfectly separable clusters -> 100% top-1.
        assert scores["knn_top1_accuracy"] == 100.0

    def test_skips_samples_with_none_image_or_label(self):
        """Samples missing image or label are dropped before embedding."""
        evaluator = make_evaluator()

        evaluator.data = [
            {"image": "img1", "label": 0},
            {"image": None, "label": 0},  # skipped
            {"image": "img2", "label": None},  # skipped
            {"image": "img3", "label": 1},
        ]
        evaluator.pipe = MagicMock(side_effect=[
            self._token_sequence([1.0, 0.0]),
            self._token_sequence([0.0, 1.0]),
        ])

        scores = evaluator.compute()

        # Pipe should only be called for valid samples.
        assert evaluator.pipe.call_count == 2
        assert "knn_top1_accuracy" in scores

    def test_raises_when_fewer_than_two_valid_samples(self):
        """ValueError is raised if <2 valid samples remain after filtering."""
        evaluator = make_evaluator()

        evaluator.data = [
            {"image": "img1", "label": 0},
            {"image": None, "label": 0},
        ]
        evaluator.pipe = MagicMock(return_value=self._token_sequence([1.0, 0.0]))

        with pytest.raises(ValueError, match="at least 2 valid samples"):
            evaluator.compute()