microsoft · jeon185 · Apr 30, 2026 · Apr 23, 2026 · Apr 29, 2026
@@ -1271,6 +1271,23 @@
       }
     }
   },
+  {
+    "hf_id": "cross-encoder/nli-deberta-v3-small",
+    "task": "zero-shot-classification",
+    "model_type": "deberta-v2",
+    "group": "Top200",
+    "priority": "P1",
+    "dataset_config": {
+      "path": "nyu-mll/multi_nli",
+      "split": "validation_matched",
+      "metric": "accuracy",
+      "columns_mapping": {
+        "input_column": "premise",
+        "label_column": "genre",
+        "candidate_labels": "fiction,government,slate,telephone,travel"
+      }
+    }
+  },
   {
     "hf_id": "openai/clip-vit-base-patch32",
     "task": "zero-shot-image-classification",
@@ -1289,6 +1306,23 @@
       }
     }
   },
+  {
+    "hf_id": "joeddav/xlm-roberta-large-xnli",
+    "task": "zero-shot-classification",
+    "model_type": "xlm-roberta",
+    "group": "Top200",
+    "priority": "P1",
+    "dataset_config": {
+      "path": "nyu-mll/multi_nli",
+      "split": "validation_matched",
+      "metric": "accuracy",
+      "columns_mapping": {
+        "input_column": "premise",
+        "label_column": "genre",
+        "candidate_labels": "fiction,government,slate,telephone,travel"
+      }
+    }
+  },
   {
     "hf_id": "openai/clip-vit-large-patch14",
     "task": "zero-shot-image-classification",
@@ -1307,6 +1341,23 @@
       }
     }
   },
+  {
+    "hf_id": "lxyuan/distilbert-base-multilingual-cased-sentiments-student",
+    "task": "zero-shot-classification",
+    "model_type": "distilbert",
+    "group": "Top200",
+    "priority": "P1",
+    "dataset_config": {
+      "path": "nyu-mll/multi_nli",
+      "split": "validation_matched",
+      "metric": "accuracy",
+      "columns_mapping": {
+        "input_column": "premise",
+        "label_column": "genre",
+        "candidate_labels": "fiction,government,slate,telephone,travel"
+      }
+    }
+  },
   {
     "hf_id": "openai/clip-vit-large-patch14-336",
     "task": "zero-shot-image-classification",
@@ -1325,6 +1376,23 @@
       }
     }
   },
+  {
+    "hf_id": "MoritzLaurer/DeBERTa-v3-large-mnli-fever-anli-ling-wanli",
+    "task": "zero-shot-classification",
+    "model_type": "deberta-v2",
+    "group": "Top200",
+    "priority": "P1",
+    "dataset_config": {
+      "path": "nyu-mll/multi_nli",
+      "split": "validation_matched",
+      "metric": "accuracy",
+      "columns_mapping": {
+        "input_column": "premise",
+        "label_column": "genre",
+        "candidate_labels": "fiction,government,slate,telephone,travel"
+      }
+    }
+  },
   {
     "hf_id": "openai/clip-vit-base-patch16",
     "task": "zero-shot-image-classification",
@@ -1343,6 +1411,23 @@
       }
     }
   },
+  {
+    "hf_id": "MoritzLaurer/deberta-v3-large-zeroshot-v2.0",
+    "task": "zero-shot-classification",
+    "model_type": "deberta-v2",
+    "group": "Top200",
+    "priority": "P1",
+    "dataset_config": {
+      "path": "nyu-mll/multi_nli",
+      "split": "validation_matched",
+      "metric": "accuracy",
+      "columns_mapping": {
+        "input_column": "premise",
+        "label_column": "genre",
+        "candidate_labels": "fiction,government,slate,telephone,travel"
+      }
+    }
+  },
   {
     "hf_id": "laion/CLIP-ViT-B-32-laion2B-s34B-b79K",
     "task": "zero-shot-image-classification",
@@ -1361,6 +1446,23 @@
       }
     }
   },
+  {
+    "hf_id": "MoritzLaurer/mDeBERTa-v3-base-mnli-xnli",
+    "task": "zero-shot-classification",
+    "model_type": "deberta-v2",
+    "group": "Top200",
+    "priority": "P1",
+    "dataset_config": {
+      "path": "nyu-mll/multi_nli",
+      "split": "validation_matched",
+      "metric": "accuracy",
+      "columns_mapping": {
+        "input_column": "premise",
+        "label_column": "genre",
+        "candidate_labels": "fiction,government,slate,telephone,travel"
+      }
+    }
+  },
   {
     "hf_id": "patrickjohncyh/fashion-clip",
     "task": "zero-shot-image-classification",
@@ -1378,6 +1480,23 @@
       }
     }
   },
+  {
+    "hf_id": "MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7",
+    "task": "zero-shot-classification",
+    "model_type": "deberta-v2",
+    "group": "Top200",
+    "priority": "P1",
+    "dataset_config": {
+      "path": "nyu-mll/multi_nli",
+      "split": "validation_matched",
+      "metric": "accuracy",
+      "columns_mapping": {
+        "input_column": "premise",
+        "label_column": "genre",
+        "candidate_labels": "fiction,government,slate,telephone,travel"
+      }
+    }
+  },
   {
     "hf_id": "google/siglip-so400m-patch14-384",
     "task": "zero-shot-image-classification",

@@ -45,6 +45,7 @@
     "sentence-similarity": TextDataset,
     "next-sentence-prediction": TextDataset,
     "fill-mask": TextDataset,
+    "zero-shot-classification": TextDataset,
     "image-segmentation": ImageSegmentationDataset,
     "random": RandomDataset,
     # Add more task types as needed

@@ -15,6 +15,7 @@
 from .fill_mask_evaluator import WinMLFillMaskEvaluator
 from .image_feature_extraction_evaluator import WinMLImageFeatureExtractionEvaluator
 from .image_segmentation_evaluator import WinMLImageSegmentationEvaluator
+from .metrics.classification import ClassificationMetric
 from .metrics.knn_accuracy import KNNAccuracyMetric
 from .metrics.mean_average_precision import MAPMetric
 from .metrics.mean_iou import IGNORE_INDEX, MeanIoUMetric
@@ -25,11 +26,13 @@
 from .question_answering_evaluator import WinMLQuestionAnsweringEvaluator
 from .text_classification_evaluator import WinMLTextClassificationEvaluator
 from .token_classification_evaluator import WinMLTokenClassificationEvaluator
+from .zero_shot_classification_evaluator import WinMLZeroShotClassificationEvaluator
 from .zero_shot_image_classification_evaluator import WinMLZeroShotImageClassificationEvaluator
 
 
 __all__ = [
     "IGNORE_INDEX",
+    "ClassificationMetric",
     "EvalResult",
     "KNNAccuracyMetric",
     "MAPMetric",
@@ -47,6 +50,7 @@
     "WinMLQuestionAnsweringEvaluator",
     "WinMLTextClassificationEvaluator",
     "WinMLTokenClassificationEvaluator",
+    "WinMLZeroShotClassificationEvaluator",
     "WinMLZeroShotImageClassificationEvaluator",
     "evaluate",
 ]
@@ -48,7 +48,7 @@
        self.model = model
        self.config = config
        self.data = self.prepare_data()
        self.pipe = self.prepare_pipeline()

    def compute(self) -> dict[str, Any]:
        """Run evaluation and return metrics."""
@@ -143,6 +143,38 @@
             device="cpu",
         )
 
+    def _fixed_seq_length(self) -> int | None:
+        """Return the model's fixed sequence length, or ``None`` if dynamic.
+
+        Reads ``io_config["input_shapes"]`` and treats an integer second
+        dimension as a static sequence length. Subclasses use this to decide
+        whether tokenized inputs need to be padded/truncated to a fixed size.
+
+        Only the first entry of ``input_shapes`` is inspected. A model with
+        no ``io_config`` attribute, a falsy ``io_config``, or a missing/empty
+        ``input_shapes`` is reported as dynamic.
+
+        Returns:
+            The second dimension of the first input shape when it is an
+            ``int``; otherwise ``None``.
+        """
+        # Fall back to an empty mapping so the lookup below never hits None.
+        io_config = getattr(self.model, "io_config", None) or {}
+        # ``[[]]`` stands in for "one input with no known dimensions", which
+        # fails the length check below and therefore reads as dynamic.
+        shapes = io_config.get("input_shapes") or [[]]
+        # A non-int second dimension (e.g. a symbolic name) is treated as dynamic.
+        if len(shapes[0]) > 1 and isinstance(shapes[0][1], int):
+            return shapes[0][1]
+        return None
+
+    def _pad_or_truncate(self, encoding: Any, tokenizer: Any) -> Any:
+        """Resize tokenized inputs to the model's fixed sequence length.
+
+        No-op for dynamic-shape models. Otherwise truncates over-length
+        tensors and delegates padding to the tokenizer.
+
+        Args:
+            encoding: Mapping of input names to tokenized values. Over-length
+                entries are replaced in this mapping before padding, so the
+                caller's object is modified.
+            tokenizer: Object exposing ``pad(encoding, padding=...,
+                max_length=..., return_tensors=...)``.
+
+        Returns:
+            ``encoding`` unchanged when ``_fixed_seq_length()`` returns
+            ``None``; otherwise the result of ``tokenizer.pad`` called with
+            ``padding="max_length"`` and ``return_tensors="pt"``.
+        """
+        seq_len = self._fixed_seq_length()
+        if seq_len is None:
+            return encoding
+        # Iterate over a snapshot so entries can be reassigned inside the loop.
+        for key, tensor in list(encoding.items()):
+            # Only values with a ``shape`` and at least two dimensions are
+            # truncated, and only along dimension 1.
+            # NOTE(review): ``.dim()`` is a torch.Tensor method; a value that
+            # has ``shape`` but no ``dim()`` (e.g. a NumPy array) would raise
+            # here -- confirm inputs are always torch tensors.
+            if hasattr(tensor, "shape") and tensor.dim() >= 2 and tensor.shape[1] > seq_len:
+                encoding[key] = tensor[:, :seq_len]
+        # Shorter inputs are brought up to ``seq_len`` by the tokenizer itself.
+        return tokenizer.pad(
+            encoding,
+            padding="max_length",
+            max_length=seq_len,
+            return_tensors="pt",
+        )
+
     def align_labels(self, dataset: Dataset, ds_config: DatasetConfig) -> Dataset:
         """Align dataset labels and filter unsupported IDs.
 

@@ -23,6 +23,7 @@
 from .question_answering_evaluator import WinMLQuestionAnsweringEvaluator
 from .text_classification_evaluator import WinMLTextClassificationEvaluator
 from .token_classification_evaluator import WinMLTokenClassificationEvaluator
+from .zero_shot_classification_evaluator import WinMLZeroShotClassificationEvaluator
 from .zero_shot_image_classification_evaluator import WinMLZeroShotImageClassificationEvaluator
 
 
@@ -43,6 +44,7 @@
     "sentence-similarity": WinMLFeatureExtractionEvaluator,
     "image-feature-extraction": WinMLImageFeatureExtractionEvaluator,
     "fill-mask": WinMLFillMaskEvaluator,
+    "zero-shot-classification": WinMLZeroShotClassificationEvaluator,
     "zero-shot-image-classification": WinMLZeroShotImageClassificationEvaluator,
 }
 
@@ -127,6 +129,16 @@
         streaming=True,
         columns_mapping={"input_column": "text"},
     ),
+    "zero-shot-classification": DatasetConfig(
+        path="fancyzhx/ag_news",
+        split="test",
+        samples=100,
+        shuffle=True,
+        columns_mapping={
+            "input_column": "text",
+            "label_column": "label",
+        },
+    ),
     "zero-shot-image-classification": DatasetConfig(
         path="uoft-cs/cifar100",
         split="test",

@@ -5,6 +5,7 @@
 
 """Evaluation metrics."""
 
+from .classification import ClassificationMetric
 from .knn_accuracy import KNNAccuracyMetric
 from .mean_average_precision import MAPMetric
 from .mean_iou import IGNORE_INDEX, MeanIoUMetric
@@ -15,6 +16,7 @@
 
 __all__ = [
     "IGNORE_INDEX",
+    "ClassificationMetric",
     "KNNAccuracyMetric",
     "MAPMetric",
     "MeanIoUMetric",

@@ -0,0 +1,56 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+
+"""Classification metrics.
+
+Accuracy and macro-F1 over string labels, for classification evaluators
+that do not have an HF evaluate wrapper (e.g. zero-shot-classification).
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+
+class ClassificationMetric:
+    """Accuracy and macro-F1 over string labels.
+
+    Holds no state; all inputs are passed to :meth:`compute`.
+    """
+
+    def compute(
+        self,
+        predictions: list[str],
+        references: list[str],
+        labels: list[str],
+    ) -> dict[str, Any]:
+        """Compute accuracy and macro-F1.
+
+        Args:
+            predictions: Predicted label strings, one per sample.
+            references: Ground-truth label strings, one per sample.
+            labels: Full set of class labels for macro-F1 averaging. Only
+                the F1 computation receives this list; accuracy is computed
+                from ``references`` and ``predictions`` alone.
+
+        Returns:
+            Dict with ``accuracy`` and ``f1`` (both floats in [0, 1]).
+
+        Raises:
+            ValueError: If ``predictions`` and ``references`` differ in
+                length, or if ``references`` or ``labels`` is empty.
+        """
+        # Function-scope import: sklearn is only loaded when compute() runs.
+        from sklearn.metrics import accuracy_score, f1_score
+
+        # Validate up front so failures carry a message specific to this metric.
+        if len(predictions) != len(references):
+            raise ValueError(
+                f"predictions and references must have the same length, "
+                f"got {len(predictions)} vs {len(references)}.",
+            )
+        if not references:
+            raise ValueError("references must not be empty.")
+        if not labels:
+            raise ValueError("labels must not be empty.")
+
+        accuracy = accuracy_score(references, predictions)
+        # Per the scikit-learn documentation, ``zero_division=0`` makes a class
+        # whose score is undefined count as 0 in the macro average.
+        macro_f1 = f1_score(
+            references,
+            predictions,
+            labels=labels,
+            average="macro",
+            zero_division=0,
+        )
+        # Cast to built-in floats so the result holds plain Python numbers.
+        return {"accuracy": float(accuracy), "f1": float(macro_f1)}