Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 80 additions & 0 deletions scripts/e2e_eval/cache/baseline_cache.json
Original file line number Diff line number Diff line change
Expand Up @@ -579,6 +579,86 @@
"elapsed": 449.7,
"command": "python.exe run_pytorch_baseline.py --model swin-large-patch4-window7-224 --task image-classification --device cpu --num-samples 1000 --dataset mini-imagenet --split test"
},
"facebook/dinov2-small|image-feature-extraction|timm/mini-imagenet||test|1000": {
"status": "PASS",
"metric": {
"metric": "knn_top1_accuracy",
"value": 86.2,
"num_samples": 1000
},
"elapsed": 141.5,
"command": "python.exe run_pytorch_baseline.py --model dinov2-small --task image-feature-extraction --device cpu --num-samples 1000 --dataset mini-imagenet --split test"
},
"facebook/dinov2-base|image-feature-extraction|timm/mini-imagenet||test|1000": {
"status": "PASS",
"metric": {
"metric": "knn_top1_accuracy",
"value": 89.5,
"num_samples": 1000
},
"elapsed": 316.5,
"command": "python.exe run_pytorch_baseline.py --model dinov2-base --task image-feature-extraction --device cpu --num-samples 1000 --dataset mini-imagenet --split test"
},
"facebook/dino-vits16|image-feature-extraction|timm/mini-imagenet||test|1000": {
"status": "PASS",
"metric": {
"metric": "knn_top1_accuracy",
"value": 79.7,
"num_samples": 1000
},
"elapsed": 94.2,
"command": "python.exe run_pytorch_baseline.py --model dino-vits16 --task image-feature-extraction --device cpu --num-samples 1000 --dataset mini-imagenet --split test"
},
"facebook/dino-vitb16|image-feature-extraction|timm/mini-imagenet||test|1000": {
"status": "PASS",
"metric": {
"metric": "knn_top1_accuracy",
"value": 83.2,
"num_samples": 1000
},
"elapsed": 248.6,
"command": "python.exe run_pytorch_baseline.py --model dino-vitb16 --task image-feature-extraction --device cpu --num-samples 1000 --dataset mini-imagenet --split test"
},
"google/vit-base-patch16-224-in21k|image-feature-extraction|timm/mini-imagenet||test|1000": {
"status": "PASS",
"metric": {
"metric": "knn_top1_accuracy",
"value": 91.9,
"num_samples": 1000
},
"elapsed": 231.2,
"command": "python.exe run_pytorch_baseline.py --model vit-base-patch16-224-in21k --task image-feature-extraction --device cpu --num-samples 1000 --dataset mini-imagenet --split test"
},
"StanfordAIMI/dinov2-base-xray-224|image-feature-extraction|Ewakaa/pneumonia_classification_chest_xray||test|582": {
"status": "PASS",
"metric": {
"metric": "knn_top1_accuracy",
"value": 93.6426,
"num_samples": 582
},
"elapsed": 166.0,
"command": "python.exe run_pytorch_baseline.py --model dinov2-base-xray-224 --task image-feature-extraction --device cpu --num-samples 582 --dataset pneumonia_classification_chest_xray --split test"
},
"facebook/dinov2-large|image-feature-extraction|timm/mini-imagenet||test|1000": {
"status": "PASS",
"metric": {
"metric": "knn_top1_accuracy",
"value": 91.1,
"num_samples": 1000
},
"elapsed": 890.6,
"command": "python.exe run_pytorch_baseline.py --model dinov2-large --task image-feature-extraction --device cpu --num-samples 1000 --dataset mini-imagenet --split test"
},
"microsoft/rad-dino|image-feature-extraction|Ewakaa/pneumonia_classification_chest_xray||test|582": {
"status": "PASS",
"metric": {
"metric": "knn_top1_accuracy",
"value": 94.6735,
"num_samples": 582
},
"elapsed": 894.7,
"command": "python.exe run_pytorch_baseline.py --model rad-dino --task image-feature-extraction --device cpu --num-samples 582 --dataset pneumonia_classification_chest_xray --split test"
},
"google-bert/bert-base-uncased|fill-mask|Salesforce/wikitext|wikitext-2-raw-v1|test|100": {
"status": "PASS",
"metric": {
Expand Down
3 changes: 2 additions & 1 deletion scripts/e2e_eval/run_pytorch_baseline.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ def _emit_result(metric: str, value: float, num_samples: int) -> None:
"image-segmentation": "mean_iou",
"feature-extraction": "cosine_spearman",
"sentence-similarity": "cosine_spearman",
"image-feature-extraction": "knn_top1_accuracy",
"fill-mask": "pseudo_perplexity",
}

Expand All @@ -75,8 +76,8 @@ def _emit_result(metric: str, value: float, num_samples: int) -> None:
def _load_pytorch_model(model_id: str, task: str, device_str: str):
"""Load a native PyTorch model with the task-appropriate AutoModel class."""
import torch
from transformers import AutoConfig

from transformers import AutoConfig
from winml.modelkit.loader.task import resolve_task_and_model_class

config = AutoConfig.from_pretrained(model_id)
Expand Down
112 changes: 112 additions & 0 deletions scripts/e2e_eval/testsets/models_with_acc.json
Original file line number Diff line number Diff line change
Expand Up @@ -1015,6 +1015,118 @@
}
}
},
{
"hf_id": "facebook/dinov2-small",
"task": "image-feature-extraction",
"model_type": "dinov2",
"group": "Top200",
"priority": "P1",
"dataset_config": {
"path": "timm/mini-imagenet",
"split": "test",
"samples": 1000,
"metric": "knn_top1_accuracy",
Comment thread
zhenchaoni marked this conversation as resolved.
"winml_metric_key": "knn_top1_accuracy"
}
},
{
"hf_id": "facebook/dinov2-base",
"task": "image-feature-extraction",
"model_type": "dinov2",
"group": "Top200",
"priority": "P1",
"dataset_config": {
"path": "timm/mini-imagenet",
"split": "test",
"samples": 1000,
"metric": "knn_top1_accuracy",
"winml_metric_key": "knn_top1_accuracy"
}
},
{
"hf_id": "facebook/dinov2-large",
"task": "image-feature-extraction",
"model_type": "dinov2",
"group": "Top200",
"priority": "P1",
"dataset_config": {
"path": "timm/mini-imagenet",
"split": "test",
"samples": 1000,
"metric": "knn_top1_accuracy",
"winml_metric_key": "knn_top1_accuracy"
}
},
{
"hf_id": "facebook/dino-vits16",
"task": "image-feature-extraction",
"model_type": "vit",
"group": "Top200",
"priority": "P1",
"dataset_config": {
"path": "timm/mini-imagenet",
"split": "test",
"samples": 1000,
"metric": "knn_top1_accuracy",
"winml_metric_key": "knn_top1_accuracy"
}
},
{
"hf_id": "facebook/dino-vitb16",
"task": "image-feature-extraction",
"model_type": "vit",
"group": "Top200",
"priority": "P1",
"dataset_config": {
"path": "timm/mini-imagenet",
"split": "test",
"samples": 1000,
"metric": "knn_top1_accuracy",
"winml_metric_key": "knn_top1_accuracy"
}
},
{
"hf_id": "google/vit-base-patch16-224-in21k",
"task": "image-feature-extraction",
"model_type": "vit",
"group": "Top200",
"priority": "P1",
"dataset_config": {
"path": "timm/mini-imagenet",
"split": "test",
"samples": 1000,
"metric": "knn_top1_accuracy",
"winml_metric_key": "knn_top1_accuracy"
}
},
{
"hf_id": "microsoft/rad-dino",
"task": "image-feature-extraction",
"model_type": "dinov2",
"group": "Top200",
"priority": "P1",
"dataset_config": {
"path": "Ewakaa/pneumonia_classification_chest_xray",
"split": "test",
"samples": 582,
"metric": "knn_top1_accuracy",
"winml_metric_key": "knn_top1_accuracy"
}
},
{
"hf_id": "StanfordAIMI/dinov2-base-xray-224",
"task": "image-feature-extraction",
"model_type": "dinov2",
"group": "Top200",
"priority": "P1",
"dataset_config": {
"path": "Ewakaa/pneumonia_classification_chest_xray",
"split": "test",
"samples": 582,
"metric": "knn_top1_accuracy",
"winml_metric_key": "knn_top1_accuracy"
}
},
{
"hf_id": "google-bert/bert-base-uncased",
"task": "fill-mask",
Expand Down
2 changes: 2 additions & 0 deletions scripts/e2e_eval/utils/accuracy.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@ class AccuracyVerdict(str, Enum):
# False = smaller value is better (WER, loss)
METRIC_COMPARE_STRATEGY: dict[str, tuple[str, float, float, bool]] = {
# The only entry in this table that uses "delta_absolute" (2.0 / 4.0) instead of
# a relative delta.
"cosine_spearman": ("delta_absolute", 2.0, 4.0, True),
# kNN top-1 accuracy: the WinML-vs-baseline delta is small, so use tighter
# relative thresholds (0.02 / 0.05, i.e. 2% / 5%) than the "default" entry
# (0.05 / 0.10).
"knn_top1_accuracy": ("delta_relative", 0.02, 0.05, True),
# Final flag is False: a smaller value is better for this metric.
"pseudo_perplexity": ("delta_relative", 0.05, 0.10, False),
# NOTE(review): presumably the fallback for metrics without their own entry —
# confirm at the lookup site.
"default": ("delta_relative", 0.05, 0.10, True),  # 5% and 10%
}
Expand Down
4 changes: 4 additions & 0 deletions src/winml/modelkit/eval/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,9 @@
from .evaluate import EvalResult, evaluate
from .feature_extraction_evaluator import WinMLFeatureExtractionEvaluator
from .fill_mask_evaluator import WinMLFillMaskEvaluator
from .image_feature_extraction_evaluator import WinMLImageFeatureExtractionEvaluator
from .image_segmentation_evaluator import WinMLImageSegmentationEvaluator
from .metrics.knn_accuracy import KNNAccuracyMetric
from .metrics.mean_average_precision import MAPMetric
from .metrics.mean_iou import IGNORE_INDEX, MeanIoUMetric
from .metrics.pseudo_perplexity import PseudoPerplexityMetric
Expand All @@ -27,6 +29,7 @@
__all__ = [
"IGNORE_INDEX",
"EvalResult",
"KNNAccuracyMetric",
"MAPMetric",
"MeanIoUMetric",
"PseudoPerplexityMetric",
Expand All @@ -35,6 +38,7 @@
"WinMLEvaluator",
"WinMLFeatureExtractionEvaluator",
"WinMLFillMaskEvaluator",
"WinMLImageFeatureExtractionEvaluator",
"WinMLImageSegmentationEvaluator",
"WinMLObjectDetectionEvaluator",
"WinMLQuestionAnsweringEvaluator",
Expand Down
8 changes: 8 additions & 0 deletions src/winml/modelkit/eval/evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from .config import WinMLEvaluationConfig
from .feature_extraction_evaluator import WinMLFeatureExtractionEvaluator
from .fill_mask_evaluator import WinMLFillMaskEvaluator
from .image_feature_extraction_evaluator import WinMLImageFeatureExtractionEvaluator
from .image_segmentation_evaluator import WinMLImageSegmentationEvaluator
from .object_detection_evaluator import WinMLObjectDetectionEvaluator
from .question_answering_evaluator import WinMLQuestionAnsweringEvaluator
Expand All @@ -40,6 +41,7 @@
"question-answering": WinMLQuestionAnsweringEvaluator,
"feature-extraction": WinMLFeatureExtractionEvaluator,
"sentence-similarity": WinMLFeatureExtractionEvaluator,
"image-feature-extraction": WinMLImageFeatureExtractionEvaluator,
"fill-mask": WinMLFillMaskEvaluator,
}

Expand Down Expand Up @@ -109,6 +111,12 @@
),
"feature-extraction": _FE_DEFAULT,
"sentence-similarity": _FE_DEFAULT,
"image-feature-extraction": DatasetConfig(
path="timm/mini-imagenet",
split="test",
samples=1000,
shuffle=True,
),
"fill-mask": DatasetConfig(
path="Salesforce/wikitext",
name="wikitext-2-raw-v1",
Expand Down
Loading
Loading