Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
119 changes: 119 additions & 0 deletions scripts/e2e_eval/testsets/models_with_acc.json
Original file line number Diff line number Diff line change
Expand Up @@ -1271,6 +1271,23 @@
}
}
},
{
"hf_id": "cross-encoder/nli-deberta-v3-small",
"task": "zero-shot-classification",
"model_type": "deberta-v2",
"group": "Top200",
"priority": "P1",
"dataset_config": {
"path": "nyu-mll/multi_nli",
"split": "validation_matched",
"metric": "accuracy",
"columns_mapping": {
"input_column": "premise",
"label_column": "genre",
"candidate_labels": "fiction,government,slate,telephone,travel"
}
}
},
{
"hf_id": "openai/clip-vit-base-patch32",
"task": "zero-shot-image-classification",
Expand All @@ -1289,6 +1306,23 @@
}
}
},
{
"hf_id": "joeddav/xlm-roberta-large-xnli",
"task": "zero-shot-classification",
"model_type": "xlm-roberta",
"group": "Top200",
"priority": "P1",
"dataset_config": {
"path": "nyu-mll/multi_nli",
"split": "validation_matched",
"metric": "accuracy",
"columns_mapping": {
"input_column": "premise",
"label_column": "genre",
"candidate_labels": "fiction,government,slate,telephone,travel"
}
}
},
{
"hf_id": "openai/clip-vit-large-patch14",
"task": "zero-shot-image-classification",
Expand All @@ -1307,6 +1341,23 @@
}
}
},
{
"hf_id": "lxyuan/distilbert-base-multilingual-cased-sentiments-student",
"task": "zero-shot-classification",
"model_type": "distilbert",
"group": "Top200",
"priority": "P1",
"dataset_config": {
"path": "nyu-mll/multi_nli",
"split": "validation_matched",
"metric": "accuracy",
"columns_mapping": {
"input_column": "premise",
"label_column": "genre",
"candidate_labels": "fiction,government,slate,telephone,travel"
}
}
},
{
"hf_id": "openai/clip-vit-large-patch14-336",
"task": "zero-shot-image-classification",
Expand All @@ -1325,6 +1376,23 @@
}
}
},
{
"hf_id": "MoritzLaurer/DeBERTa-v3-large-mnli-fever-anli-ling-wanli",
"task": "zero-shot-classification",
"model_type": "deberta-v2",
"group": "Top200",
"priority": "P1",
"dataset_config": {
"path": "nyu-mll/multi_nli",
"split": "validation_matched",
"metric": "accuracy",
"columns_mapping": {
"input_column": "premise",
"label_column": "genre",
"candidate_labels": "fiction,government,slate,telephone,travel"
}
}
},
{
"hf_id": "openai/clip-vit-base-patch16",
"task": "zero-shot-image-classification",
Expand All @@ -1343,6 +1411,23 @@
}
}
},
{
"hf_id": "MoritzLaurer/deberta-v3-large-zeroshot-v2.0",
"task": "zero-shot-classification",
"model_type": "deberta-v2",
"group": "Top200",
"priority": "P1",
"dataset_config": {
"path": "nyu-mll/multi_nli",
"split": "validation_matched",
"metric": "accuracy",
"columns_mapping": {
"input_column": "premise",
"label_column": "genre",
"candidate_labels": "fiction,government,slate,telephone,travel"
}
}
},
{
"hf_id": "laion/CLIP-ViT-B-32-laion2B-s34B-b79K",
"task": "zero-shot-image-classification",
Expand All @@ -1361,6 +1446,23 @@
}
}
},
{
"hf_id": "MoritzLaurer/mDeBERTa-v3-base-mnli-xnli",
"task": "zero-shot-classification",
"model_type": "deberta-v2",
"group": "Top200",
"priority": "P1",
"dataset_config": {
"path": "nyu-mll/multi_nli",
"split": "validation_matched",
"metric": "accuracy",
"columns_mapping": {
"input_column": "premise",
"label_column": "genre",
"candidate_labels": "fiction,government,slate,telephone,travel"
}
}
},
{
"hf_id": "patrickjohncyh/fashion-clip",
"task": "zero-shot-image-classification",
Expand All @@ -1378,6 +1480,23 @@
}
}
},
{
"hf_id": "MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7",
"task": "zero-shot-classification",
"model_type": "deberta-v2",
"group": "Top200",
"priority": "P1",
"dataset_config": {
"path": "nyu-mll/multi_nli",
"split": "validation_matched",
"metric": "accuracy",
"columns_mapping": {
"input_column": "premise",
"label_column": "genre",
"candidate_labels": "fiction,government,slate,telephone,travel"
}
}
},
{
"hf_id": "google/siglip-so400m-patch14-384",
"task": "zero-shot-image-classification",
Expand Down
1 change: 1 addition & 0 deletions src/winml/modelkit/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
"sentence-similarity": TextDataset,
"next-sentence-prediction": TextDataset,
"fill-mask": TextDataset,
"zero-shot-classification": TextDataset,
"image-segmentation": ImageSegmentationDataset,
"random": RandomDataset,
# Add more task types as needed
Expand Down
4 changes: 4 additions & 0 deletions src/winml/modelkit/eval/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from .fill_mask_evaluator import WinMLFillMaskEvaluator
from .image_feature_extraction_evaluator import WinMLImageFeatureExtractionEvaluator
from .image_segmentation_evaluator import WinMLImageSegmentationEvaluator
from .metrics.classification import ClassificationMetric
from .metrics.knn_accuracy import KNNAccuracyMetric
from .metrics.mean_average_precision import MAPMetric
from .metrics.mean_iou import IGNORE_INDEX, MeanIoUMetric
Expand All @@ -25,11 +26,13 @@
from .question_answering_evaluator import WinMLQuestionAnsweringEvaluator
from .text_classification_evaluator import WinMLTextClassificationEvaluator
from .token_classification_evaluator import WinMLTokenClassificationEvaluator
from .zero_shot_classification_evaluator import WinMLZeroShotClassificationEvaluator
from .zero_shot_image_classification_evaluator import WinMLZeroShotImageClassificationEvaluator


__all__ = [
"IGNORE_INDEX",
"ClassificationMetric",
"EvalResult",
"KNNAccuracyMetric",
"MAPMetric",
Expand All @@ -47,6 +50,7 @@
"WinMLQuestionAnsweringEvaluator",
"WinMLTextClassificationEvaluator",
"WinMLTokenClassificationEvaluator",
"WinMLZeroShotClassificationEvaluator",
"WinMLZeroShotImageClassificationEvaluator",
"evaluate",
]
32 changes: 32 additions & 0 deletions src/winml/modelkit/eval/base_evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@
self.model = model
self.config = config
self.data = self.prepare_data()
self.pipe = self.prepare_pipeline()

Check warning

Code scanning / CodeQL

`__init__` method calls overridden method Warning

This call to WinMLEvaluator.prepare_pipeline in an initialization method is overridden by WinMLDepthEstimationEvaluator.prepare_pipeline.
This call to WinMLEvaluator.prepare_pipeline in an initialization method is overridden by WinMLFeatureExtractionEvaluator.prepare_pipeline.
This call to WinMLEvaluator.prepare_pipeline in an initialization method is overridden by WinMLImageFeatureExtractionEvaluator.prepare_pipeline.
This call to WinMLEvaluator.prepare_pipeline in an initialization method is overridden by WinMLImageSegmentationEvaluator.prepare_pipeline.
This call to WinMLEvaluator.prepare_pipeline in an initialization method is overridden by WinMLKeypointDetectionEvaluator.prepare_pipeline.
This call to WinMLEvaluator.prepare_pipeline in an initialization method is overridden by WinMLObjectDetectionEvaluator.prepare_pipeline.
This call to WinMLEvaluator.prepare_pipeline in an initialization method is overridden by WinMLQuestionAnsweringEvaluator.prepare_pipeline.
This call to WinMLEvaluator.prepare_pipeline in an initialization method is overridden by WinMLTextClassificationEvaluator.prepare_pipeline.
This call to WinMLEvaluator.prepare_pipeline in an initialization method is overridden by WinMLTokenClassificationEvaluator.prepare_pipeline.
This call to WinMLEvaluator.prepare_pipeline in an initialization method is overridden by WinMLZeroShotClassificationEvaluator.prepare_pipeline.

def compute(self) -> dict[str, Any]:
"""Run evaluation and return metrics."""
Expand Down Expand Up @@ -143,6 +143,38 @@
device="cpu",
)

def _fixed_seq_length(self) -> int | None:
"""Return the model's fixed sequence length, or ``None`` if dynamic.

Reads ``io_config["input_shapes"]`` and treats an integer second
dimension as a static sequence length. Subclasses use this to decide
whether tokenized inputs need to be padded/truncated to a fixed size.
"""
io_config = getattr(self.model, "io_config", None) or {}
shapes = io_config.get("input_shapes") or [[]]
if len(shapes[0]) > 1 and isinstance(shapes[0][1], int):
return shapes[0][1]
return None

def _pad_or_truncate(self, encoding: Any, tokenizer: Any) -> Any:
    """Fit tokenized inputs to the model's fixed sequence length.

    When ``_fixed_seq_length()`` reports ``None`` the encoding is returned
    untouched. Otherwise every entry that exposes ``shape``, has at least
    two dimensions, and is longer than the fixed length along dimension 1
    is sliced down to that length (the entry is replaced in ``encoding``
    itself). The result of ``tokenizer.pad`` with ``padding="max_length"``
    and ``return_tensors="pt"`` is then returned.
    """
    target_len = self._fixed_seq_length()
    if target_len is None:
        return encoding

    # Iterate over a snapshot because entries are reassigned in the loop.
    for name, value in tuple(encoding.items()):
        if not hasattr(value, "shape"):
            continue
        if value.dim() < 2 or value.shape[1] <= target_len:
            continue
        encoding[name] = value[:, :target_len]

    pad_kwargs = {
        "padding": "max_length",
        "max_length": target_len,
        "return_tensors": "pt",
    }
    return tokenizer.pad(encoding, **pad_kwargs)

def align_labels(self, dataset: Dataset, ds_config: DatasetConfig) -> Dataset:
"""Align dataset labels and filter unsupported IDs.

Expand Down
12 changes: 12 additions & 0 deletions src/winml/modelkit/eval/evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from .question_answering_evaluator import WinMLQuestionAnsweringEvaluator
from .text_classification_evaluator import WinMLTextClassificationEvaluator
from .token_classification_evaluator import WinMLTokenClassificationEvaluator
from .zero_shot_classification_evaluator import WinMLZeroShotClassificationEvaluator
from .zero_shot_image_classification_evaluator import WinMLZeroShotImageClassificationEvaluator


Expand All @@ -43,6 +44,7 @@
"sentence-similarity": WinMLFeatureExtractionEvaluator,
"image-feature-extraction": WinMLImageFeatureExtractionEvaluator,
"fill-mask": WinMLFillMaskEvaluator,
"zero-shot-classification": WinMLZeroShotClassificationEvaluator,
"zero-shot-image-classification": WinMLZeroShotImageClassificationEvaluator,
}

Expand Down Expand Up @@ -127,6 +129,16 @@
streaming=True,
columns_mapping={"input_column": "text"},
),
"zero-shot-classification": DatasetConfig(
path="fancyzhx/ag_news",
split="test",
samples=100,
shuffle=True,
columns_mapping={
"input_column": "text",
"label_column": "label",
},
),
"zero-shot-image-classification": DatasetConfig(
path="uoft-cs/cifar100",
split="test",
Expand Down
2 changes: 2 additions & 0 deletions src/winml/modelkit/eval/metrics/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

"""Evaluation metrics."""

from .classification import ClassificationMetric
from .knn_accuracy import KNNAccuracyMetric
from .mean_average_precision import MAPMetric
from .mean_iou import IGNORE_INDEX, MeanIoUMetric
Expand All @@ -15,6 +16,7 @@

__all__ = [
"IGNORE_INDEX",
"ClassificationMetric",
"KNNAccuracyMetric",
"MAPMetric",
"MeanIoUMetric",
Expand Down
56 changes: 56 additions & 0 deletions src/winml/modelkit/eval/metrics/classification.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------------------

"""Classification metrics.

Accuracy and macro-F1 over string labels, for classification evaluators
that do not have an HF evaluate wrapper (e.g. zero-shot-classification).
"""

from __future__ import annotations

from typing import Any


class ClassificationMetric:
    """Accuracy and macro-F1 over string labels."""

    def compute(
        self,
        predictions: list[str],
        references: list[str],
        labels: list[str],
    ) -> dict[str, Any]:
        """Compute accuracy and macro-F1.

        Args:
            predictions: Predicted label strings, one per sample.
            references: Ground-truth label strings, one per sample.
            labels: Full set of class labels for macro-F1 averaging.

        Returns:
            Dict with ``accuracy`` and ``f1`` (both floats in [0, 1]).

        Raises:
            ValueError: If ``predictions`` and ``references`` differ in
                length, or if ``references`` or ``labels`` is empty.
        """
        # Validate the arguments before touching scikit-learn so that bad
        # input always surfaces as the documented ValueError, independent of
        # whether (or how quickly) the optional dependency can be imported.
        if len(predictions) != len(references):
            raise ValueError(
                f"predictions and references must have the same length, "
                f"got {len(predictions)} vs {len(references)}.",
            )
        if not references:
            raise ValueError("references must not be empty.")
        if not labels:
            raise ValueError("labels must not be empty.")

        # Imported lazily: only needed once there is something to score.
        from sklearn.metrics import accuracy_score, f1_score

        accuracy = accuracy_score(references, predictions)
        # ``labels`` pins the set of classes averaged over; ``zero_division=0``
        # scores a class with no predicted/true samples as 0 instead of warning.
        macro_f1 = f1_score(
            references,
            predictions,
            labels=labels,
            average="macro",
            zero_division=0,
        )
        return {"accuracy": float(accuracy), "f1": float(macro_f1)}
Loading
Loading