diff --git a/scripts/e2e_eval/cache/baseline_cache.json b/scripts/e2e_eval/cache/baseline_cache.json
index 013d8acdc..033a5b376 100644
--- a/scripts/e2e_eval/cache/baseline_cache.json
+++ b/scripts/e2e_eval/cache/baseline_cache.json
@@ -579,6 +579,86 @@
     "elapsed": 449.7,
     "command": "python.exe run_pytorch_baseline.py --model swin-large-patch4-window7-224 --task image-classification --device cpu --num-samples 1000 --dataset mini-imagenet --split test"
   },
+  "facebook/dinov2-small|image-feature-extraction|timm/mini-imagenet||test|1000": {
+    "status": "PASS",
+    "metric": {
+      "metric": "knn_top1_accuracy",
+      "value": 86.2,
+      "num_samples": 1000
+    },
+    "elapsed": 141.5,
+    "command": "python.exe run_pytorch_baseline.py --model dinov2-small --task image-feature-extraction --device cpu --num-samples 1000 --dataset mini-imagenet --split test"
+  },
+  "facebook/dinov2-base|image-feature-extraction|timm/mini-imagenet||test|1000": {
+    "status": "PASS",
+    "metric": {
+      "metric": "knn_top1_accuracy",
+      "value": 89.5,
+      "num_samples": 1000
+    },
+    "elapsed": 316.5,
+    "command": "python.exe run_pytorch_baseline.py --model dinov2-base --task image-feature-extraction --device cpu --num-samples 1000 --dataset mini-imagenet --split test"
+  },
+  "facebook/dino-vits16|image-feature-extraction|timm/mini-imagenet||test|1000": {
+    "status": "PASS",
+    "metric": {
+      "metric": "knn_top1_accuracy",
+      "value": 79.7,
+      "num_samples": 1000
+    },
+    "elapsed": 94.2,
+    "command": "python.exe run_pytorch_baseline.py --model dino-vits16 --task image-feature-extraction --device cpu --num-samples 1000 --dataset mini-imagenet --split test"
+  },
+  "facebook/dino-vitb16|image-feature-extraction|timm/mini-imagenet||test|1000": {
+    "status": "PASS",
+    "metric": {
+      "metric": "knn_top1_accuracy",
+      "value": 83.2,
+      "num_samples": 1000
+    },
+    "elapsed": 248.6,
+    "command": "python.exe run_pytorch_baseline.py --model dino-vitb16 --task image-feature-extraction --device cpu --num-samples 1000 --dataset mini-imagenet --split test"
+  },
+  "google/vit-base-patch16-224-in21k|image-feature-extraction|timm/mini-imagenet||test|1000": {
+    "status": "PASS",
+    "metric": {
+      "metric": "knn_top1_accuracy",
+      "value": 91.9,
+      "num_samples": 1000
+    },
+    "elapsed": 231.2,
+    "command": "python.exe run_pytorch_baseline.py --model vit-base-patch16-224-in21k --task image-feature-extraction --device cpu --num-samples 1000 --dataset mini-imagenet --split test"
+  },
+  "StanfordAIMI/dinov2-base-xray-224|image-feature-extraction|Ewakaa/pneumonia_classification_chest_xray||test|582": {
+    "status": "PASS",
+    "metric": {
+      "metric": "knn_top1_accuracy",
+      "value": 93.6426,
+      "num_samples": 582
+    },
+    "elapsed": 166.0,
+    "command": "python.exe run_pytorch_baseline.py --model dinov2-base-xray-224 --task image-feature-extraction --device cpu --num-samples 582 --dataset pneumonia_classification_chest_xray --split test"
+  },
+  "facebook/dinov2-large|image-feature-extraction|timm/mini-imagenet||test|1000": {
+    "status": "PASS",
+    "metric": {
+      "metric": "knn_top1_accuracy",
+      "value": 91.1,
+      "num_samples": 1000
+    },
+    "elapsed": 890.6,
+    "command": "python.exe run_pytorch_baseline.py --model dinov2-large --task image-feature-extraction --device cpu --num-samples 1000 --dataset mini-imagenet --split test"
+  },
+  "microsoft/rad-dino|image-feature-extraction|Ewakaa/pneumonia_classification_chest_xray||test|582": {
+    "status": "PASS",
+    "metric": {
+      "metric": "knn_top1_accuracy",
+      "value": 94.6735,
+      "num_samples": 582
+    },
+    "elapsed": 894.7,
+    "command": "python.exe run_pytorch_baseline.py --model rad-dino --task image-feature-extraction --device cpu --num-samples 582 --dataset pneumonia_classification_chest_xray --split test"
+  },
   "google-bert/bert-base-uncased|fill-mask|Salesforce/wikitext|wikitext-2-raw-v1|test|100": {
     "status": "PASS",
     "metric": {
diff --git a/scripts/e2e_eval/run_pytorch_baseline.py b/scripts/e2e_eval/run_pytorch_baseline.py
index d9c3bda4f..8b74bd023 100644
--- a/scripts/e2e_eval/run_pytorch_baseline.py
+++ b/scripts/e2e_eval/run_pytorch_baseline.py
@@ -63,6 +63,7 @@ def _emit_result(metric: str, value: float, num_samples: int) -> None:
     "image-segmentation": "mean_iou",
     "feature-extraction": "cosine_spearman",
     "sentence-similarity": "cosine_spearman",
+    "image-feature-extraction": "knn_top1_accuracy",
     "fill-mask": "pseudo_perplexity",
 }
 
@@ -75,8 +76,8 @@ def _emit_result(metric: str, value: float, num_samples: int) -> None:
 def _load_pytorch_model(model_id: str, task: str, device_str: str):
     """Load a native PyTorch model with the task-appropriate AutoModel class."""
     import torch
-    from transformers import AutoConfig
 
+    from transformers import AutoConfig
     from winml.modelkit.loader.task import resolve_task_and_model_class
 
     config = AutoConfig.from_pretrained(model_id)
diff --git a/scripts/e2e_eval/testsets/models_with_acc.json b/scripts/e2e_eval/testsets/models_with_acc.json
index 44aabe069..01ae3614f 100644
--- a/scripts/e2e_eval/testsets/models_with_acc.json
+++ b/scripts/e2e_eval/testsets/models_with_acc.json
@@ -1015,6 +1015,118 @@
       }
     }
   },
+  {
+    "hf_id": "facebook/dinov2-small",
+    "task": "image-feature-extraction",
+    "model_type": "dinov2",
+    "group": "Top200",
+    "priority": "P1",
+    "dataset_config": {
+      "path": "timm/mini-imagenet",
+      "split": "test",
+      "samples": 1000,
+      "metric": "knn_top1_accuracy",
+      "winml_metric_key": "knn_top1_accuracy"
+    }
+  },
+  {
+    "hf_id": "facebook/dinov2-base",
+    "task": "image-feature-extraction",
+    "model_type": "dinov2",
+    "group": "Top200",
+    "priority": "P1",
+    "dataset_config": {
+      "path": "timm/mini-imagenet",
+      "split": "test",
+      "samples": 1000,
+      "metric": "knn_top1_accuracy",
+      "winml_metric_key": "knn_top1_accuracy"
+    }
+  },
+  {
+    "hf_id": "facebook/dinov2-large",
+    "task": "image-feature-extraction",
+    "model_type": "dinov2",
+    "group": "Top200",
+    "priority": "P1",
+    "dataset_config": {
+      "path": "timm/mini-imagenet",
+      "split": "test",
+      "samples": 1000,
+      "metric": "knn_top1_accuracy",
+      "winml_metric_key": "knn_top1_accuracy"
+    }
+  },
+  {
+    "hf_id": "facebook/dino-vits16",
+    "task": "image-feature-extraction",
+    "model_type": "vit",
+    "group": "Top200",
+    "priority": "P1",
+    "dataset_config": {
+      "path": "timm/mini-imagenet",
+      "split": "test",
+      "samples": 1000,
+      "metric": "knn_top1_accuracy",
+      "winml_metric_key": "knn_top1_accuracy"
+    }
+  },
+  {
+    "hf_id": "facebook/dino-vitb16",
+    "task": "image-feature-extraction",
+    "model_type": "vit",
+    "group": "Top200",
+    "priority": "P1",
+    "dataset_config": {
+      "path": "timm/mini-imagenet",
+      "split": "test",
+      "samples": 1000,
+      "metric": "knn_top1_accuracy",
+      "winml_metric_key": "knn_top1_accuracy"
+    }
+  },
+  {
+    "hf_id": "google/vit-base-patch16-224-in21k",
+    "task": "image-feature-extraction",
+    "model_type": "vit",
+    "group": "Top200",
+    "priority": "P1",
+    "dataset_config": {
+      "path": "timm/mini-imagenet",
+      "split": "test",
+      "samples": 1000,
+      "metric": "knn_top1_accuracy",
+      "winml_metric_key": "knn_top1_accuracy"
+    }
+  },
+  {
+    "hf_id": "microsoft/rad-dino",
+    "task": "image-feature-extraction",
+    "model_type": "dinov2",
+    "group": "Top200",
+    "priority": "P1",
+    "dataset_config": {
+      "path": "Ewakaa/pneumonia_classification_chest_xray",
+      "split": "test",
+      "samples": 582,
+      "metric": "knn_top1_accuracy",
+      "winml_metric_key": "knn_top1_accuracy"
+    }
+  },
+  {
+    "hf_id": "StanfordAIMI/dinov2-base-xray-224",
+    "task": "image-feature-extraction",
+    "model_type": "dinov2",
+    "group": "Top200",
+    "priority": "P1",
+    "dataset_config": {
+      "path": "Ewakaa/pneumonia_classification_chest_xray",
+      "split": "test",
+      "samples": 582,
+      "metric": "knn_top1_accuracy",
+      "winml_metric_key": "knn_top1_accuracy"
+    }
+  },
   {
     "hf_id": "google-bert/bert-base-uncased",
     "task": "fill-mask",
diff --git a/scripts/e2e_eval/utils/accuracy.py b/scripts/e2e_eval/utils/accuracy.py
index a07882e35..e1996a03d 100644
--- a/scripts/e2e_eval/utils/accuracy.py
+++ b/scripts/e2e_eval/utils/accuracy.py
@@ -44,6 +44,8 @@ class AccuracyVerdict(str, Enum):
 #                   False = smaller value is better (WER, loss)
 METRIC_COMPARE_STRATEGY: dict[str, tuple[str, float, float, bool]] = {
     "cosine_spearman": ("delta_absolute", 2.0, 4.0, True),
+    # kNN top-1: WinML-vs-baseline delta is small, so use tighter thresholds (2%/5%) than default.
+    "knn_top1_accuracy": ("delta_relative", 0.02, 0.05, True),
     "pseudo_perplexity": ("delta_relative", 0.05, 0.10, False),
     "default": ("delta_relative", 0.05, 0.10, True), # 5% and 10%
 }
diff --git a/src/winml/modelkit/eval/__init__.py b/src/winml/modelkit/eval/__init__.py
index 2b8536c2f..7066eb126 100644
--- a/src/winml/modelkit/eval/__init__.py
+++ b/src/winml/modelkit/eval/__init__.py
@@ -13,7 +13,9 @@
 from .evaluate import EvalResult, evaluate
 from .feature_extraction_evaluator import WinMLFeatureExtractionEvaluator
 from .fill_mask_evaluator import WinMLFillMaskEvaluator
+from .image_feature_extraction_evaluator import WinMLImageFeatureExtractionEvaluator
 from .image_segmentation_evaluator import WinMLImageSegmentationEvaluator
+from .metrics.knn_accuracy import KNNAccuracyMetric
 from .metrics.mean_average_precision import MAPMetric
 from .metrics.mean_iou import IGNORE_INDEX, MeanIoUMetric
 from .metrics.pseudo_perplexity import PseudoPerplexityMetric
@@ -27,6 +29,7 @@
 __all__ = [
     "IGNORE_INDEX",
     "EvalResult",
+    "KNNAccuracyMetric",
     "MAPMetric",
     "MeanIoUMetric",
     "PseudoPerplexityMetric",
@@ -35,6 +38,7 @@
     "WinMLEvaluator",
     "WinMLFeatureExtractionEvaluator",
     "WinMLFillMaskEvaluator",
+    "WinMLImageFeatureExtractionEvaluator",
     "WinMLImageSegmentationEvaluator",
     "WinMLObjectDetectionEvaluator",
     "WinMLQuestionAnsweringEvaluator",
diff --git a/src/winml/modelkit/eval/evaluate.py b/src/winml/modelkit/eval/evaluate.py
index deba9e1d1..f068d174e 100644
--- a/src/winml/modelkit/eval/evaluate.py
+++ b/src/winml/modelkit/eval/evaluate.py
@@ -18,6 +18,7 @@
 from .config import WinMLEvaluationConfig
 from .feature_extraction_evaluator import WinMLFeatureExtractionEvaluator
 from .fill_mask_evaluator import WinMLFillMaskEvaluator
+from .image_feature_extraction_evaluator import WinMLImageFeatureExtractionEvaluator
 from .image_segmentation_evaluator import WinMLImageSegmentationEvaluator
 from .object_detection_evaluator import WinMLObjectDetectionEvaluator
 from .question_answering_evaluator import WinMLQuestionAnsweringEvaluator
@@ -40,6 +41,7 @@
     "question-answering": WinMLQuestionAnsweringEvaluator,
     "feature-extraction": WinMLFeatureExtractionEvaluator,
     "sentence-similarity": WinMLFeatureExtractionEvaluator,
+    "image-feature-extraction": WinMLImageFeatureExtractionEvaluator,
     "fill-mask": WinMLFillMaskEvaluator,
 }
 
@@ -109,6 +111,12 @@
     ),
     "feature-extraction": _FE_DEFAULT,
     "sentence-similarity": _FE_DEFAULT,
+    "image-feature-extraction": DatasetConfig(
+        path="timm/mini-imagenet",
+        split="test",
+        samples=1000,
+        shuffle=True,
+    ),
     "fill-mask": DatasetConfig(
         path="Salesforce/wikitext",
         name="wikitext-2-raw-v1",
diff --git a/src/winml/modelkit/eval/image_feature_extraction_evaluator.py b/src/winml/modelkit/eval/image_feature_extraction_evaluator.py
new file mode 100644
index 000000000..0832f87a8
--- /dev/null
+++ b/src/winml/modelkit/eval/image_feature_extraction_evaluator.py
@@ -0,0 +1,130 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+
+"""Image feature extraction evaluator using kNN classification accuracy.
+
+Evaluates image embedding models (e.g. DINOv2, DINO, ViT-in21k) by:
+  1. Extracting the CLS token embedding for each image via the pipeline.
+  2. Running a leave-one-out k-Nearest Neighbor classifier.
+  3. Reporting kNN top-1 and top-5 accuracy.
+
+Pipeline output contract (HF image-feature-extraction):
+    pipe(image) -> [[[float, ...]]]   shape: [1, num_tokens, hidden_dim]
+    The first token (index 0) is the CLS token — the image-level embedding.
+
+Ground-truth dataset (default: timm/mini-imagenet):
+    {"image": PIL.Image, "label": ClassLabel}
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any
+
+import numpy as np
+from tqdm import tqdm
+
+from .base_evaluator import WinMLEvaluator
+
+
+if TYPE_CHECKING:
+    from datasets import Dataset
+    from transformers.pipelines.base import Pipeline
+
+    from ..datasets.config import DatasetConfig
+    from ..models.winml.base import WinMLPreTrainedModel
+    from .config import WinMLEvaluationConfig
+
+
+class WinMLImageFeatureExtractionEvaluator(WinMLEvaluator):
+    """Evaluator for image feature extraction using kNN classification accuracy.
+
+    Image and label column names come from ``config.dataset.columns_mapping``
+    (``input_column`` / ``label_column``), defaulting to ``image`` / ``label``.
+    """
+
+    @classmethod
+    def schema_info(cls) -> list:
+        """Return expected dataset schema for image feature extraction evaluation."""
+        from .config import SchemaColumn
+
+        return [
+            SchemaColumn("image", "Image", "input_column", description="PIL Image"),
+            SchemaColumn(
+                "label", "ClassLabel", "label_column",
+                description="integer class label",
+            ),
+        ]
+
+    def __init__(self, config: WinMLEvaluationConfig, model: WinMLPreTrainedModel) -> None:
+        mapping = config.dataset.columns_mapping or {}  # tolerate an unset mapping
+        self._image_col = mapping.get("input_column", "image")
+        self._label_col = mapping.get("label_column", "label")
+        super().__init__(config, model)
+
+    def prepare_pipeline(self) -> Pipeline:
+        """Create pipeline and match image processor size to ONNX input shape.
+
+        Dynamic or symbolic height/width leave the processor size untouched.
+        """
+        pipe = super().prepare_pipeline()
+
+        io_config = getattr(self.model, "io_config", None) or {}
+        input_shapes = io_config.get("input_shapes") or []
+        if input_shapes and len(input_shapes[0]) == 4:
+            _, _, h, w = input_shapes[0]
+            if all(isinstance(d, (int, np.integer)) and d > 0 for d in (h, w)):
+                pipe.image_processor.size = {"height": h, "width": w}
+
+        return pipe
+
+    def align_labels(self, dataset: Dataset, ds_config: DatasetConfig) -> Dataset:
+        """No-op: kNN uses dataset labels directly, no model-side label mapping."""
+        return dataset
+
+    def compute(self) -> dict[str, Any]:
+        """Run kNN evaluation and return accuracy metrics.
+
+        Raises:
+            ValueError: If fewer than 2 samples have both an image and a label.
+        """
+        from .metrics.knn_accuracy import KNNAccuracyMetric
+
+        embeddings: list[np.ndarray] = []
+        labels: list[int] = []
+
+        for sample in tqdm(self.data, desc="Embedding images", unit="img"):
+            image = sample.get(self._image_col)
+            label = sample.get(self._label_col)
+            if image is None or label is None:
+                continue
+            raw = self.pipe(image)
+            embeddings.append(self._extract_image_embedding(raw))
+            labels.append(int(label))
+
+        if len(embeddings) < 2:
+            raise ValueError(
+                f"Need at least 2 valid samples for kNN evaluation, got {len(embeddings)}."
+            )
+
+        metric = KNNAccuracyMetric(k=10)
+        return metric.compute(np.array(embeddings), np.array(labels))
+
+    @staticmethod
+    def _extract_image_embedding(raw: Any) -> np.ndarray:
+        """Reduce a pipeline output to a single 1D image-level embedding vector.
+
+        ``[1, num_tokens, hidden]`` yields the CLS token (index 0); ``[1, hidden]``
+        (pooled / projected) is used as-is; other shapes raise ``ValueError``.
+        """
+        tokens = np.asarray(raw[0])
+        if tokens.ndim == 1:
+            return tokens
+        if tokens.ndim == 2:
+            # CLS token (index 0) — standard image-level embedding for ViT/DINOv2.
+            return tokens[0]
+        raise ValueError(
+            f"Unsupported image-feature-extraction output shape: {np.asarray(raw).shape}. "
+            "Expected [1, hidden] (pooled) or [1, num_tokens, hidden] (token sequence)."
+        )
diff --git a/src/winml/modelkit/eval/metrics/__init__.py b/src/winml/modelkit/eval/metrics/__init__.py
index 6964891ce..59500ae35 100644
--- a/src/winml/modelkit/eval/metrics/__init__.py
+++ b/src/winml/modelkit/eval/metrics/__init__.py
@@ -5,6 +5,7 @@
 
 """Evaluation metrics."""
 
+from .knn_accuracy import KNNAccuracyMetric
 from .mean_average_precision import MAPMetric
 from .mean_iou import IGNORE_INDEX, MeanIoUMetric
 from .pseudo_perplexity import PseudoPerplexityMetric
@@ -13,6 +14,7 @@
 
 __all__ = [
     "IGNORE_INDEX",
+    "KNNAccuracyMetric",
     "MAPMetric",
     "MeanIoUMetric",
     "PseudoPerplexityMetric",
diff --git a/src/winml/modelkit/eval/metrics/knn_accuracy.py b/src/winml/modelkit/eval/metrics/knn_accuracy.py
new file mode 100644
index 000000000..7cffee56d
--- /dev/null
+++ b/src/winml/modelkit/eval/metrics/knn_accuracy.py
@@ -0,0 +1,154 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+
+"""k-Nearest Neighbor accuracy metric for image feature extraction.
+
+Evaluates embedding quality by using a leave-one-out kNN classifier:
+  1. For each sample, find its k nearest neighbors by cosine similarity.
+  2. Predict label via a cosine-similarity-weighted vote among neighbors.
+  3. Report top-1 and top-5 kNN classification accuracy.
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+import numpy as np
+
+
+_DEFAULT_K = 10
+
+
+class KNNAccuracyMetric:
+    """k-Nearest Neighbor classification accuracy on embeddings.
+
+    Typical usage::
+
+        metric = KNNAccuracyMetric(k=10)
+        result = metric.compute(embeddings, labels)
+        # {"knn_top1_accuracy": 72.5, "knn_top5_accuracy": 91.3}
+    """
+
+    def __init__(self, k: int = _DEFAULT_K) -> None:
+        self.k = k
+
+    def compute(self, embeddings: np.ndarray, labels: np.ndarray) -> dict[str, Any]:
+        """Compute kNN accuracy.
+
+        Args:
+            embeddings: (N, D) float array-like; rows are L2-normalized
+                internally, so they need not be pre-normalized.
+            labels: (N,) int array-like of ground-truth class labels.
+
+        Returns:
+            Dict with ``knn_top1_accuracy`` and ``knn_top5_accuracy``
+            as percentages in [0, 100].
+
+        Raises:
+            ValueError: If fewer than 2 samples, k < 1, ``embeddings`` is not
+                2-D, or ``labels`` does not hold exactly one label per row.
+        """
+        # Accept array-likes: neighbor voting needs NumPy fancy indexing of labels.
+        embeddings = np.asarray(embeddings)
+        labels = np.asarray(labels)
+        if embeddings.ndim != 2:
+            raise ValueError(f"embeddings must be 2-D (N, D), got shape {embeddings.shape}.")
+
+        n = len(embeddings)
+        if n < 2:
+            raise ValueError(f"At least 2 samples required for kNN, got {n}.")
+        if self.k < 1:
+            raise ValueError(f"k must be >= 1, got {self.k}.")
+        if labels.shape != (n,):
+            raise ValueError(f"labels must have shape ({n},), got {labels.shape}.")
+
+        k = min(self.k, n - 1)
+        top1_predictions, top5_predictions = self._predict_labels(embeddings, labels, k)
+        return self._compute_accuracy(top1_predictions, top5_predictions, labels)
+
+    def _predict_labels(
+        self,
+        embeddings: np.ndarray,
+        labels: np.ndarray,
+        k: int,
+    ) -> tuple[np.ndarray, list[list[int]]]:
+        """Predict labels via kNN weighted voting.
+
+        Args:
+            embeddings: (N, D) float array.
+            labels: (N,) int array of neighbor labels used for voting.
+            k: Number of neighbors to use.
+
+        Returns:
+            Tuple of (top1_predictions, top5_predictions):
+                - top1_predictions: (N,) int array of predicted labels.
+                - top5_predictions: N lists of up to 5 labels, best vote first.
+        """
+        # L2-normalize (no-op if already normalized, safe either way)
+        norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
+        norms = np.maximum(norms, 1e-9)
+        embeddings = embeddings / norms
+
+        # Cosine similarity matrix: (N, N)
+        similarity = embeddings @ embeddings.T
+
+        # Exclude self-similarity
+        np.fill_diagonal(similarity, -np.inf)
+
+        # Top-k neighbors per row; argpartition is O(N) vs O(N log N) for a sort.
+        top_k_indices = np.argpartition(similarity, -k, axis=1)[:, -k:]
+
+        n = len(embeddings)
+        top1_predictions = np.empty(n, dtype=np.int64)
+        top5_predictions: list[list[int]] = []
+
+        for i in range(n):
+            neighbor_idx = top_k_indices[i]
+            neighbor_sims = similarity[i, neighbor_idx]
+            neighbor_labels = labels[neighbor_idx]
+
+            # Sort neighbors by similarity (descending)
+            sorted_order = np.argsort(-neighbor_sims)
+            sorted_labels = neighbor_labels[sorted_order]
+            sorted_sims = neighbor_sims[sorted_order]
+
+            # Vote weights are raw cosine similarities and may be negative.
+            vote_weights: dict[int, float] = {}
+            for label, sim in zip(sorted_labels, sorted_sims, strict=True):
+                label_int = int(label)
+                vote_weights[label_int] = vote_weights.get(label_int, 0.0) + float(sim)
+
+            ranked = sorted(vote_weights, key=lambda c: vote_weights[c], reverse=True)
+            top1_predictions[i] = ranked[0]
+            top5_predictions.append(ranked[:5])
+
+        return top1_predictions, top5_predictions
+
+    @staticmethod
+    def _compute_accuracy(
+        top1_predictions: np.ndarray,
+        top5_predictions: list[list[int]],
+        labels: np.ndarray,
+    ) -> dict[str, Any]:
+        """Compute top-1 and top-5 accuracy from predictions.
+
+        Args:
+            top1_predictions: (N,) int array of predicted labels.
+            top5_predictions: list of N lists of up to 5 candidate labels.
+            labels: (N,) int array of ground-truth labels.
+
+        Returns:
+            Dict with ``knn_top1_accuracy`` and ``knn_top5_accuracy``
+            as percentages in [0, 100].
+        """
+        n = len(labels)
+        top1_correct = int(np.sum(top1_predictions == labels))
+        top5_correct = sum(
+            int(labels[i]) in top5_predictions[i] for i in range(n)
+        )
+        top1_acc = round(top1_correct / n * 100, 4)
+        top5_acc = round(top5_correct / n * 100, 4)
+
+        return {"knn_top1_accuracy": top1_acc, "knn_top5_accuracy": top5_acc}
diff --git a/src/winml/modelkit/models/winml/__init__.py b/src/winml/modelkit/models/winml/__init__.py
index f82959b09..ea6069a17 100644
--- a/src/winml/modelkit/models/winml/__init__.py
+++ b/src/winml/modelkit/models/winml/__init__.py
@@ -48,6 +48,7 @@
     "fill-mask": "WinMLModelForMaskedLM",
     "feature-extraction": "WinMLModelForFeatureExtraction",
     "sentence-similarity": "WinMLModelForFeatureExtraction",
+    "image-feature-extraction": "WinMLModelForFeatureExtraction",
 }
 
 # Level 2: (model_type, task) -> Specialized class (exceptions only)
diff --git a/tests/unit/eval/test_image_feature_extraction_evaluator.py b/tests/unit/eval/test_image_feature_extraction_evaluator.py
new file mode 100644
index 000000000..d6333442d
--- /dev/null
+++ b/tests/unit/eval/test_image_feature_extraction_evaluator.py
@@ -0,0 +1,295 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+
+"""Unit tests for WinMLImageFeatureExtractionEvaluator and KNNAccuracyMetric."""
+
+from __future__ import annotations
+
+from unittest.mock import MagicMock, patch
+
+import numpy as np
+import pytest
+
+from winml.modelkit.eval import KNNAccuracyMetric, WinMLImageFeatureExtractionEvaluator
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+def make_evaluator(columns_mapping=None):
+    """Build an evaluator with dataset loading and pipeline creation patched out."""
+    from winml.modelkit.datasets import DatasetConfig
+    from winml.modelkit.eval import WinMLEvaluationConfig
+
+    fake_dataset = MagicMock()
+    fake_dataset.__len__ = lambda self: 10
+    fake_dataset.shuffle.return_value = fake_dataset
+    fake_dataset.select.return_value = fake_dataset
+    fake_dataset.column_names = ["image", "label"]
+
+    fake_pipeline = MagicMock()
+    fake_pipeline.image_processor = MagicMock()
+
+    fake_model = MagicMock()
+    fake_model.config.label2id = None
+    fake_model.io_config = {}
+
+    dataset_config = DatasetConfig(
+        path="timm/mini-imagenet",
+        columns_mapping=columns_mapping or {},
+    )
+    eval_config = WinMLEvaluationConfig(
+        model_id="test/model", task="image-feature-extraction", dataset=dataset_config,
+    )
+
+    with patch("datasets.load_dataset", return_value=fake_dataset), \
+         patch("transformers.pipeline", return_value=fake_pipeline):
+        return WinMLImageFeatureExtractionEvaluator(eval_config, fake_model)
+
+
+# ---------------------------------------------------------------------------
+# KNNAccuracyMetric
+# ---------------------------------------------------------------------------
+
+class TestKNNAccuracyMetric:
+    def test_perfect_clusters(self):
+        """Tightly clustered, well-separated classes -> 100% accuracy."""
+        # Two classes, two samples each, pointing along orthogonal axes.
+        points = np.array([
+            [1.0, 0.0, 0.0],
+            [0.99, 0.01, 0.0],
+            [0.0, 0.0, 1.0],
+            [0.01, 0.0, 0.99],
+        ])
+        classes = np.array([0, 0, 1, 1])
+        knn = KNNAccuracyMetric(k=3)
+        scores = knn.compute(points, classes)
+        assert scores["knn_top1_accuracy"] == 100.0
+        assert scores["knn_top5_accuracy"] == 100.0
+
+    def test_random_embeddings_returns_valid_range(self):
+        """Accuracies stay within [0, 100] even for unstructured embeddings."""
+        rng = np.random.RandomState(42)
+        knn = KNNAccuracyMetric(k=5)
+        points = rng.randn(50, 32)
+        classes = rng.randint(0, 5, size=50)
+        scores = knn.compute(points, classes)
+        for key in ("knn_top1_accuracy", "knn_top5_accuracy"):
+            assert 0.0 <= scores[key] <= 100.0
+
+    def test_k_capped_to_n_minus_1(self):
+        """A k larger than N-1 is capped instead of raising."""
+        points = np.array([
+            [1.0, 0.0],
+            [0.9, 0.1],
+            [0.0, 1.0],
+        ])
+        classes = np.array([0, 0, 1])
+        knn = KNNAccuracyMetric(k=100)
+        # Three samples leave at most two neighbors each.
+        scores = knn.compute(points, classes)
+        assert "knn_top1_accuracy" in scores
+
+    def test_too_few_samples_raises(self):
+        """A single sample has no neighbors to vote with."""
+        with pytest.raises(ValueError, match="At least 2 samples"):
+            KNNAccuracyMetric(k=5).compute(np.array([[1.0]]), np.array([0]))
+
+    def test_k_less_than_1_raises(self):
+        """k=0 is rejected at compute time."""
+        with pytest.raises(ValueError, match="k must be >= 1"):
+            KNNAccuracyMetric(k=0).compute(np.array([[1.0], [2.0]]), np.array([0, 1]))
+
+    def test_returns_float(self):
+        """Both accuracies are plain Python floats."""
+        points = np.array([[1.0, 0.0], [0.9, 0.1], [0.0, 1.0], [0.1, 0.9]])
+        classes = np.array([0, 0, 1, 1])
+        scores = KNNAccuracyMetric(k=2).compute(points, classes)
+        assert isinstance(scores["knn_top1_accuracy"], float)
+        assert isinstance(scores["knn_top5_accuracy"], float)
+
+    def test_two_samples_minimal(self):
+        """Smallest valid case: two samples."""
+        points = np.array([[1.0, 0.0], [0.0, 1.0]])
+        classes = np.array([0, 1])
+        knn = KNNAccuracyMetric(k=1)
+        scores = knn.compute(points, classes)
+        # Each sample's only neighbor carries the other label, so both miss.
+        assert scores["knn_top1_accuracy"] == 0.0
+
+
+# ---------------------------------------------------------------------------
+# WinMLImageFeatureExtractionEvaluator
+# ---------------------------------------------------------------------------
+
+class TestImageFeatureExtractionEvaluatorSchema:
+    def test_schema_has_image_and_label(self):
+        columns = WinMLImageFeatureExtractionEvaluator.schema_info()
+        declared = {column.name for column in columns}
+        assert "image" in declared
+        assert "label" in declared
+
+    def test_schema_column_types(self):
+        columns = WinMLImageFeatureExtractionEvaluator.schema_info()
+        types_by_name = {column.name: column.type for column in columns}
+        assert types_by_name["image"] == "Image"
+        assert types_by_name["label"] == "ClassLabel"
+
+
+class TestImageFeatureExtractionEvaluatorInit:
+    def test_default_label_column(self):
+        # No mapping supplied -> falls back to "label".
+        assert make_evaluator()._label_col == "label"
+
+    def test_custom_label_column(self):
+        custom = make_evaluator(columns_mapping={"label_column": "category"})
+        assert custom._label_col == "category"
+
+
+class TestImageFeatureExtractionEvaluatorAlignLabels:
+    def test_align_labels_is_noop(self):
+        """The very same dataset object comes back, untouched."""
+        ev = make_evaluator()
+        dataset, ds_config = MagicMock(), MagicMock()
+        returned = ev.align_labels(dataset, ds_config)
+        assert returned is dataset
+
+
+class TestExtractImageEmbedding:
+    """`_extract_image_embedding` behaviour for each supported output shape."""
+
+    def test_tokens_3d_returns_cls(self):
+        # Token sequence [1, num_tokens, hidden] (ViT/DINOv2 with pool=False).
+        pipeline_out = np.arange(32, dtype=np.float32).reshape(1, 4, 8)
+        vec = WinMLImageFeatureExtractionEvaluator._extract_image_embedding(pipeline_out)
+        assert vec.shape == (8,)
+        np.testing.assert_array_equal(vec, pipeline_out[0, 0])
+
+    def test_pooled_2d_returns_vector(self):
+        # Already pooled / projected output: [1, hidden].
+        pipeline_out = np.arange(16, dtype=np.float32)[np.newaxis, :]
+        vec = WinMLImageFeatureExtractionEvaluator._extract_image_embedding(pipeline_out)
+        assert vec.shape == (16,)
+        np.testing.assert_array_equal(vec, pipeline_out[0])
+
+    def test_nested_list_input_supported(self):
+        # Nested Python lists, as HF pipelines typically return; shape [1, 2, 3].
+        pipeline_out = [[[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]]
+        vec = WinMLImageFeatureExtractionEvaluator._extract_image_embedding(pipeline_out)
+        assert vec.shape == (3,)
+        np.testing.assert_array_equal(vec, np.array([1.0, 2.0, 3.0]))
+
+    def test_cnn_feature_map_raises(self):
+        # A [1, C, H, W] feature map must fail loudly, not yield a bad embedding.
+        pipeline_out = np.zeros((1, 8, 3, 3), dtype=np.float32)
+        with pytest.raises(ValueError, match="Unsupported"):
+            WinMLImageFeatureExtractionEvaluator._extract_image_embedding(pipeline_out)
+
+    def test_scalar_raises(self):
+        pipeline_out = [np.float32(1.0)]
+        with pytest.raises(ValueError, match="Unsupported"):
+            WinMLImageFeatureExtractionEvaluator._extract_image_embedding(pipeline_out)
+
+
+class TestImageFeatureExtractionEvaluatorRegistry:
+    TASK = "image-feature-extraction"
+
+    def test_registered_in_evaluator_registry(self):
+        from winml.modelkit.eval.evaluate import _EVALUATOR_REGISTRY as registry
+
+        assert self.TASK in registry
+        assert registry[self.TASK] is WinMLImageFeatureExtractionEvaluator
+
+    def test_default_dataset_registered(self):
+        from winml.modelkit.eval.evaluate import _DEFAULT_DATASETS as defaults
+
+        assert self.TASK in defaults
+        default_ds = defaults[self.TASK]
+        # Default benchmark: 1000 samples of mini-ImageNet.
+        assert default_ds.path == "timm/mini-imagenet"
+        assert default_ds.samples == 1000
+
+
+# ---------------------------------------------------------------------------
+# WinMLImageFeatureExtractionEvaluator.compute
+# ---------------------------------------------------------------------------
+
+class TestCompute:
+    """Full path: pipeline output -> CLS extraction -> kNN metric."""
+
+    @staticmethod
+    def _token_sequence(cls_vec: list[float], num_tokens: int = 3) -> list:
+        """Wrap ``cls_vec`` as the CLS row of a [1, num_tokens, hidden] output."""
+        # Filler rows stand in for patch tokens; only index 0 should be read.
+        filler = [0.5] * len(cls_vec)
+        rows = [cls_vec] + [list(filler) for _ in range(num_tokens - 1)]
+        return [rows]
+
+    def test_end_to_end_flow(self):
+        """Pipeline tokens -> CLS extraction -> kNN produces valid accuracies."""
+        ev = make_evaluator()
+
+        # Two tight, well-separated clusters with two samples apiece.
+        cls_vectors = [
+            [1.0, 0.0, 0.0],
+            [0.99, 0.01, 0.0],
+            [0.0, 1.0, 0.0],
+            [0.01, 0.99, 0.0],
+        ]
+        sample_labels = [0, 0, 1, 1]
+        ev.data = [
+            {"image": f"img{idx}", "label": lab}
+            for idx, lab in enumerate(sample_labels, start=1)
+        ]
+
+        pipe_outputs = iter([self._token_sequence(vec) for vec in cls_vectors])
+        ev.pipe = MagicMock(side_effect=lambda _img: next(pipe_outputs))
+
+        scores = ev.compute()
+
+        # Both metrics are reported.
+        assert "knn_top1_accuracy" in scores
+        assert "knn_top5_accuracy" in scores
+        # Perfectly separable clusters -> 100% top-1.
+        assert scores["knn_top1_accuracy"] == 100.0
+
+    def test_skips_samples_with_none_image_or_label(self):
+        """Rows lacking an image or a label never reach the pipeline."""
+        ev = make_evaluator()
+
+        usable_first = {"image": "img1", "label": 0}
+        missing_image = {"image": None, "label": 0}
+        missing_label = {"image": "img2", "label": None}
+        usable_second = {"image": "img3", "label": 1}
+        ev.data = [usable_first, missing_image, missing_label, usable_second]
+
+        pipe_outputs = iter([
+            self._token_sequence([1.0, 0.0]),
+            self._token_sequence([0.0, 1.0]),
+        ])
+        ev.pipe = MagicMock(side_effect=lambda _img: next(pipe_outputs))
+
+        scores = ev.compute()
+
+        # Only the two usable rows are embedded.
+        assert ev.pipe.call_count == 2
+        assert "knn_top1_accuracy" in scores
+
+    def test_raises_when_fewer_than_two_valid_samples(self):
+        """Filtering down to a single usable row makes compute() fail."""
+        ev = make_evaluator()
+        single_output = self._token_sequence([1.0, 0.0])
+        ev.pipe = MagicMock(return_value=single_output)
+
+        # One usable row plus one without an image.
+        ev.data = [
+            {"image": "img1", "label": 0},
+            {"image": None, "label": 0},
+        ]
+
+        with pytest.raises(ValueError, match="at least 2 valid samples"):
+            ev.compute()
+