diff --git a/nucleus/metrics/base.py b/nucleus/metrics/base.py
index 75d916e9..587dd5a5 100644
--- a/nucleus/metrics/base.py
+++ b/nucleus/metrics/base.py
@@ -1,7 +1,7 @@
 import sys
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
-from typing import Iterable, List
+from typing import Dict, Iterable, List
 
 from nucleus.annotation import AnnotationList
 from nucleus.prediction import PredictionList
@@ -10,6 +10,16 @@
 class MetricResult(ABC):
     """Base MetricResult class"""
 
+    @property
+    @abstractmethod
+    def results(self) -> Dict[str, float]:
+        """Interface for item results"""
+
+    @property
+    def extra_info(self) -> Dict[str, str]:
+        """Overload this to pass extra info about the item to show in the UI"""
+        return {}
+
 
 @dataclass
 class ScalarResult(MetricResult):
@@ -27,6 +37,14 @@ class ScalarResult(MetricResult):
     value: float
     weight: float = 1.0
 
+    @property
+    def results(self) -> Dict[str, float]:
+        return {"value": self.value}
+
+    @property
+    def extra_info(self) -> Dict[str, str]:
+        return {"weight:": str(self.weight)}
+
     @staticmethod
     def aggregate(results: Iterable["ScalarResult"]) -> "ScalarResult":
         """Aggregates results using a weighted average."""
@@ -37,6 +55,22 @@ def aggregate(results: Iterable["ScalarResult"]) -> "ScalarResult":
         return ScalarResult(value, total_weight)
 
 
+@dataclass
+class GroupedScalarResult(MetricResult):
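+    """Per-label results for a single item.
+
+    ``group_to_scalar`` maps each label to the ScalarResult computed for that
+    label group. ``results`` flattens the values and adds an "all_groups"
+    entry holding their weighted aggregate, e.g.
+    ``GroupedScalarResult(group_to_scalar={"car": ScalarResult(0.5, 2)}).results``
+    evaluates to ``{"car": 0.5, "all_groups": 0.5}``.
+    """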
+    group_to_scalar: Dict[str, ScalarResult]
+
+    @property
+    def results(self) -> Dict[str, float]:
+        group_results = {
+            group: scalar.value
+            for group, scalar in self.group_to_scalar.items()
+        }
+        group_results["all_groups"] = ScalarResult.aggregate(
+            self.group_to_scalar.values()
+        ).value
+        return group_results
+
+
 class Metric(ABC):
     """Abstract class for defining a metric, which takes a list of annotations
     and predictions and returns a scalar.
@@ -93,7 +127,9 @@ def __call__(
         """A metric must override this method and return a metric result, given annotations and predictions."""
 
     @abstractmethod
-    def aggregate_score(self, results: List[MetricResult]) -> ScalarResult:
+    def aggregate_score(
+        self, results: List[MetricResult]
+    ) -> Dict[str, ScalarResult]:
         """A metric must define how to aggregate results from single items to a single ScalarResult.
 
         E.g. to calculate a R2 score with sklearn you could define a custom metric class ::
diff --git a/nucleus/metrics/categorization_metrics.py b/nucleus/metrics/categorization_metrics.py
index 416f831a..0d9f01cf 100644
--- a/nucleus/metrics/categorization_metrics.py
+++ b/nucleus/metrics/categorization_metrics.py
@@ -1,6 +1,6 @@
 from abc import abstractmethod
 from dataclasses import dataclass
-from typing import List, Set, Tuple, Union
+from typing import Dict, List, Set, Tuple, Union
 
 from sklearn.metrics import f1_score
 
@@ -33,16 +33,28 @@ class CategorizationResult(MetricResult):
     predictions: List[CategoryPrediction]
 
     @property
-    def value(self):
+    def results(self) -> Dict[str, float]:
         annotation_labels = to_taxonomy_labels(self.annotations)
         prediction_labels = to_taxonomy_labels(self.predictions)
 
         # TODO: Change task.py interface such that we can return label matching
-        # NOTE: Returning 1 if all taxonomy labels match else 0
-        value = f1_score(
-            list(annotation_labels), list(prediction_labels), average="macro"
-        )
-        return value
+        results = {
+            "f1_macro": f1_score(
+                list(annotation_labels),
+                list(prediction_labels),
+                average="macro",
+            )
+        }
+        return results
+
+    @property
+    def extra_info(self) -> Dict[str, str]:
+        annotation_labels = to_taxonomy_labels(self.annotations)
+        prediction_labels = to_taxonomy_labels(self.predictions)
+        return {
+            "annotations": ", ".join(annotation_labels),
+            "predictions": ", ".join(prediction_labels),
+        }
 
 
 class CategorizationMetric(Metric):
@@ -80,7 +92,7 @@ def eval(
         pass
 
     @abstractmethod
-    def aggregate_score(self, results: List[CategorizationResult]) -> ScalarResult:  # type: ignore[override]
+    def aggregate_score(self, results: List[CategorizationResult]) -> Dict[str, ScalarResult]:  # type: ignore[override]
         pass
 
     def __call__(
@@ -189,11 +201,18 @@ def eval(
             annotations=annotations, predictions=predictions
         )
 
-    def aggregate_score(self, results: List[CategorizationResult]) -> ScalarResult:  # type: ignore[override]
+    def aggregate_score(self, results: List[CategorizationResult]) -> Dict[str, ScalarResult]:  # type: ignore[override]
         gt = []
         predicted = []
         for result in results:
             gt.extend(list(to_taxonomy_labels(result.annotations)))
             predicted.extend(list(to_taxonomy_labels(result.predictions)))
-        value = f1_score(gt, predicted, average=self.f1_method)
-        return ScalarResult(value)
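+        # Pool ground-truth and predicted labels across all items and report
+        # both macro- and weighted-averaged F1.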
+        aggregate_scores = {}
+        aggregate_scores["macro"] = f1_score(gt, predicted, average="macro")
+        aggregate_scores["weighted"] = f1_score(
+            gt, predicted, average="weighted"
+        )
+        return {
+            result_label: ScalarResult(val)
+            for result_label, val in aggregate_scores.items()
+        }
diff --git a/nucleus/metrics/label_grouper.py b/nucleus/metrics/label_grouper.py
new file mode 100644
index 00000000..4a562637
--- /dev/null
+++ b/nucleus/metrics/label_grouper.py
@@ -0,0 +1,43 @@
+from typing import Any, List
+
+import numpy as np
+import pandas as pd
+
+
+class LabelsGrouper:
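+    """Groups annotations or predictions by their ``label`` attribute.
+
+    Iterating over a LabelsGrouper yields ``(label, items)`` pairs, one per
+    distinct label in order of first appearance. ``label_group(label)``
+    returns the items carrying that label, or an empty list if there are none.
+    """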
+    def __init__(self, annotations_or_predictions_list: List[Any]):
+        self.items = annotations_or_predictions_list
+        if len(self.items) > 0:
+            assert hasattr(
+                self.items[0], "label"
+            ), f"Expected items to have attribute 'label' found none on {repr(self.items[0])}"
+        self.codes, self.labels = pd.factorize(
+            [item.label for item in self.items]
+        )
+        self.group_idx = 0
+
+    def __iter__(self):
+        self.group_idx = 0
+        return self
+
+    def __next__(self):
+        if self.group_idx >= len(self.labels):
+            raise StopIteration
+        label = self.labels[self.group_idx]
+        label_items = list(
+            np.take(self.items, np.where(self.codes == self.group_idx)[0])
+        )
+        self.group_idx += 1
+        return label, label_items
+
+    def label_group(self, label: str) -> List[Any]:
+        if len(self.items) == 0:
+            return []
+        matches = np.where(self.labels == label)[0]
+        if len(matches) > 0:
+            label_code = matches[0]
+            label_items = list(
+                np.take(self.items, np.where(self.codes == label_code)[0])
+            )
+            return label_items
+        else:
+            return []
diff --git a/nucleus/metrics/polygon_metrics.py b/nucleus/metrics/polygon_metrics.py
index 7ebbf20d..eff482f0 100644
--- a/nucleus/metrics/polygon_metrics.py
+++ b/nucleus/metrics/polygon_metrics.py
@@ -1,14 +1,16 @@
 import sys
 from abc import abstractmethod
-from typing import List, Union
+from collections import defaultdict
+from typing import Dict, List, Union
 
 import numpy as np
 
 from nucleus.annotation import AnnotationList, BoxAnnotation, PolygonAnnotation
 from nucleus.prediction import BoxPrediction, PolygonPrediction, PredictionList
 
-from .base import Metric, ScalarResult
+from .base import GroupedScalarResult, Metric, ScalarResult
 from .filters import confidence_filter, polygon_label_filter
+from .label_grouper import LabelsGrouper
 from .metric_utils import compute_average_precision
 from .polygon_utils import (
     BoxOrPolygonAnnotation,
@@ -80,19 +82,44 @@ def eval(
 
     def __init__(
         self,
-        enforce_label_match: bool = False,
+        enforce_label_match: bool = True,
         confidence_threshold: float = 0.0,
     ):
         """Initializes PolygonMetric abstract object.
 
         Args:
-            enforce_label_match: whether to enforce that annotation and prediction labels must match. Default False
+            enforce_label_match: whether to enforce that annotation and prediction labels must match. Default True
             confidence_threshold: minimum confidence threshold for predictions. Must be in [0, 1]. Default 0.0
         """
         self.enforce_label_match = enforce_label_match
         assert 0 <= confidence_threshold <= 1
         self.confidence_threshold = confidence_threshold
 
+    def eval_grouped(
+        self,
+        annotations: List[Union[BoxAnnotation, PolygonAnnotation]],
+        predictions: List[Union[BoxPrediction, PolygonPrediction]],
+    ) -> GroupedScalarResult:
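+        """Evaluates annotations against predictions per label group.
+
+        Annotations are grouped by label and each group is scored with
+        ``self.eval`` against the predictions of the same label (or against
+        all predictions when ``enforce_label_match`` is False). Returns a
+        GroupedScalarResult keyed by label.
+        """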
+        grouped_annotations = LabelsGrouper(annotations)
+        grouped_predictions = LabelsGrouper(predictions)
+        results = {}
+        for label, label_annotations in grouped_annotations:
+            # TODO(gunnar): Enforce label match -> Why is that a parameter? Should we generally allow IOU matches
+            #  between different labels?!?
+            match_predictions = (
+                grouped_predictions.label_group(label)
+                if self.enforce_label_match
+                else predictions
+            )
+            eval_fn = label_match_wrapper(self.eval)
+            result = eval_fn(
+                label_annotations,
+                match_predictions,
+                enforce_label_match=self.enforce_label_match,
+            )
+            results[label] = result
+        return GroupedScalarResult(group_to_scalar=results)
+
     @abstractmethod
     def eval(
         self,
@@ -102,12 +129,20 @@ def eval(
         # Main evaluation function that subclasses must override.
         pass
 
-    def aggregate_score(self, results: List[ScalarResult]) -> ScalarResult:  # type: ignore[override]
-        return ScalarResult.aggregate(results)
+    def aggregate_score(self, results: List[GroupedScalarResult]) -> Dict[str, ScalarResult]:  # type: ignore[override]
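+        # Collect the per-item results by label, then take the weighted
+        # average within each label across items.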
+        label_to_values = defaultdict(list)
+        for item_result in results:
+            for label, label_result in item_result.group_to_scalar.items():
+                label_to_values[label].append(label_result)
+        scores = {
+            label: ScalarResult.aggregate(values)
+            for label, values in label_to_values.items()
+        }
+        return scores
 
     def __call__(
         self, annotations: AnnotationList, predictions: PredictionList
-    ) -> ScalarResult:
+    ) -> GroupedScalarResult:
         if self.confidence_threshold > 0:
             predictions = confidence_filter(
                 predictions, self.confidence_threshold
@@ -119,11 +154,9 @@ def __call__(
         polygon_predictions.extend(predictions.box_predictions)
         polygon_predictions.extend(predictions.polygon_predictions)
 
-        eval_fn = label_match_wrapper(self.eval)
-        result = eval_fn(
+        result = self.eval_grouped(
             polygon_annotations,
             polygon_predictions,
-            enforce_label_match=self.enforce_label_match,
         )
         return result
 
@@ -166,7 +199,7 @@ class PolygonIOU(PolygonMetric):
     # TODO: Remove defaults once these are surfaced more cleanly to users.
     def __init__(
         self,
-        enforce_label_match: bool = False,
+        enforce_label_match: bool = True,
         iou_threshold: float = 0.0,
         confidence_threshold: float = 0.0,
     ):
@@ -234,7 +267,7 @@ class PolygonPrecision(PolygonMetric):
     # TODO: Remove defaults once these are surfaced more cleanly to users.
     def __init__(
         self,
-        enforce_label_match: bool = False,
+        enforce_label_match: bool = True,
         iou_threshold: float = 0.5,
         confidence_threshold: float = 0.0,
     ):
@@ -303,7 +336,7 @@ class PolygonRecall(PolygonMetric):
     # TODO: Remove defaults once these are surfaced more cleanly to users.
     def __init__(
         self,
-        enforce_label_match: bool = False,
+        enforce_label_match: bool = True,
         iou_threshold: float = 0.5,
         confidence_threshold: float = 0.0,
     ):
@@ -460,7 +493,7 @@ def __init__(
             0 <= iou_threshold <= 1
         ), "IoU threshold must be between 0 and 1."
         self.iou_threshold = iou_threshold
-        super().__init__(enforce_label_match=False, confidence_threshold=0)
+        super().__init__(enforce_label_match=True, confidence_threshold=0)
 
     def eval(
         self,
diff --git a/nucleus/metrics/polygon_utils.py b/nucleus/metrics/polygon_utils.py
index 8d746b51..d19bd8de 100644
--- a/nucleus/metrics/polygon_utils.py
+++ b/nucleus/metrics/polygon_utils.py
@@ -273,7 +273,7 @@ def wrapper(
         annotations: List[BoxOrPolygonAnnotation],
         predictions: List[BoxOrPolygonPrediction],
         *args,
-        enforce_label_match: bool = False,
+        enforce_label_match: bool = True,
         **kwargs,
     ) -> ScalarResult:
         # Simply return the metric if we are not enforcing label matches.
diff --git a/pyproject.toml b/pyproject.toml
index bb27ec32..e85193a8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -45,6 +45,7 @@ click = ">=7.1.2,<9.0"  # NOTE: COLAB has 7.1.2 and has problems updating
 rich = "^10.15.2"
 shellingham = "^1.4.0"
 scikit-learn = ">=0.24.0"
+pandas = ">=1.0"
 
 [tool.poetry.dev-dependencies]
 poetry = "^1.1.5"
diff --git a/tests/metrics/test_categorization_metrics.py b/tests/metrics/test_categorization_metrics.py
index 98c5407a..0dc47ef5 100644
--- a/tests/metrics/test_categorization_metrics.py
+++ b/tests/metrics/test_categorization_metrics.py
@@ -29,9 +29,10 @@ def test_perfect_match_f1_score():
             )
         )
 
-    assert results
+    assert [res.results for res in results]
     aggregate_result = metric.aggregate_score(results)
-    assert aggregate_result.value == 1
+    for result_label, scalar in aggregate_result.items():
+        assert scalar.value == 1
 
 
 def test_no_match_f1_score():
diff --git a/tests/metrics/test_polygon_metrics.py b/tests/metrics/test_polygon_metrics.py
index 6d7fc8fd..9fe57d75 100644
--- a/tests/metrics/test_polygon_metrics.py
+++ b/tests/metrics/test_polygon_metrics.py
@@ -30,36 +30,18 @@
             PolygonIOU,
             {"enforce_label_match": True},
         ),
-        (
-            TEST_BOX_ANNOTATION_LIST,
-            TEST_BOX_PREDICTION_LIST,
-            PolygonIOU,
-            {"enforce_label_match": False},
-        ),
         (
             TEST_BOX_ANNOTATION_LIST,
             TEST_BOX_PREDICTION_LIST,
             PolygonPrecision,
             {"enforce_label_match": True},
         ),
-        (
-            TEST_BOX_ANNOTATION_LIST,
-            TEST_BOX_PREDICTION_LIST,
-            PolygonPrecision,
-            {"enforce_label_match": False},
-        ),
         (
             TEST_BOX_ANNOTATION_LIST,
             TEST_BOX_PREDICTION_LIST,
             PolygonRecall,
             {"enforce_label_match": True},
         ),
-        (
-            TEST_BOX_ANNOTATION_LIST,
-            TEST_BOX_PREDICTION_LIST,
-            PolygonRecall,
-            {"enforce_label_match": False},
-        ),
         (TEST_BOX_ANNOTATION_LIST, TEST_BOX_PREDICTION_LIST, PolygonMAP, {}),
         (
             TEST_CONVEX_POLYGON_ANNOTATION_LIST,
@@ -67,36 +49,18 @@
             PolygonIOU,
             {"enforce_label_match": True},
         ),
-        (
-            TEST_CONVEX_POLYGON_ANNOTATION_LIST,
-            TEST_CONVEX_POLYGON_PREDICTION_LIST,
-            PolygonIOU,
-            {"enforce_label_match": False},
-        ),
         (
             TEST_CONVEX_POLYGON_ANNOTATION_LIST,
             TEST_CONVEX_POLYGON_PREDICTION_LIST,
             PolygonPrecision,
             {"enforce_label_match": True},
         ),
-        (
-            TEST_CONVEX_POLYGON_ANNOTATION_LIST,
-            TEST_CONVEX_POLYGON_PREDICTION_LIST,
-            PolygonPrecision,
-            {"enforce_label_match": False},
-        ),
         (
             TEST_CONVEX_POLYGON_ANNOTATION_LIST,
             TEST_CONVEX_POLYGON_PREDICTION_LIST,
             PolygonRecall,
             {"enforce_label_match": True},
         ),
-        (
-            TEST_CONVEX_POLYGON_ANNOTATION_LIST,
-            TEST_CONVEX_POLYGON_PREDICTION_LIST,
-            PolygonRecall,
-            {"enforce_label_match": False},
-        ),
         (
             TEST_CONVEX_POLYGON_ANNOTATION_LIST,
             TEST_CONVEX_POLYGON_PREDICTION_LIST,
@@ -111,7 +75,8 @@ def test_perfect_match_polygon_metrics(
     # Test metrics on where annotations = predictions perfectly
     metric = metric_fn(**kwargs)
     result = metric(test_annotations, test_predictions)
-    assert_metric_eq(result, ScalarResult(1, len(test_annotations)))
+    for label, result_val in result.items():
+        assert_metric_eq(result_val, ScalarResult(1, 1))
 
 
 @pytest.mark.parametrize(
@@ -123,36 +88,18 @@ def test_perfect_match_polygon_metrics(
             PolygonIOU,
             {"enforce_label_match": True},
         ),
-        (
-            TEST_BOX_ANNOTATION_LIST,
-            TEST_BOX_PREDICTION_LIST,
-            PolygonIOU,
-            {"enforce_label_match": False},
-        ),
         (
             TEST_BOX_ANNOTATION_LIST,
             TEST_BOX_PREDICTION_LIST,
             PolygonPrecision,
             {"enforce_label_match": True},
         ),
-        (
-            TEST_BOX_ANNOTATION_LIST,
-            TEST_BOX_PREDICTION_LIST,
-            PolygonPrecision,
-            {"enforce_label_match": False},
-        ),
         (
             TEST_BOX_ANNOTATION_LIST,
             TEST_BOX_PREDICTION_LIST,
             PolygonRecall,
             {"enforce_label_match": True},
         ),
-        (
-            TEST_BOX_ANNOTATION_LIST,
-            TEST_BOX_PREDICTION_LIST,
-            PolygonRecall,
-            {"enforce_label_match": False},
-        ),
         (TEST_BOX_ANNOTATION_LIST, TEST_BOX_PREDICTION_LIST, PolygonMAP, {}),
         (
             TEST_CONVEX_POLYGON_ANNOTATION_LIST,
@@ -160,36 +107,18 @@ def test_perfect_match_polygon_metrics(
             PolygonIOU,
             {"enforce_label_match": True},
         ),
-        (
-            TEST_CONVEX_POLYGON_ANNOTATION_LIST,
-            TEST_CONVEX_POLYGON_PREDICTION_LIST,
-            PolygonIOU,
-            {"enforce_label_match": False},
-        ),
         (
             TEST_CONVEX_POLYGON_ANNOTATION_LIST,
             TEST_CONVEX_POLYGON_PREDICTION_LIST,
             PolygonPrecision,
             {"enforce_label_match": True},
         ),
-        (
-            TEST_CONVEX_POLYGON_ANNOTATION_LIST,
-            TEST_CONVEX_POLYGON_PREDICTION_LIST,
-            PolygonPrecision,
-            {"enforce_label_match": False},
-        ),
         (
             TEST_CONVEX_POLYGON_ANNOTATION_LIST,
             TEST_CONVEX_POLYGON_PREDICTION_LIST,
             PolygonRecall,
             {"enforce_label_match": True},
         ),
-        (
-            TEST_CONVEX_POLYGON_ANNOTATION_LIST,
-            TEST_CONVEX_POLYGON_PREDICTION_LIST,
-            PolygonRecall,
-            {"enforce_label_match": False},
-        ),
         (
             TEST_CONVEX_POLYGON_ANNOTATION_LIST,
             TEST_CONVEX_POLYGON_PREDICTION_LIST,
@@ -209,7 +138,8 @@ def test_perfect_unmatched_polygon_metrics(
         polygon.reference_id += "_bad"
     metric = metric_fn(**kwargs)
     result = metric(test_annotations, test_predictions_unmatch)
-    assert_metric_eq(result, ScalarResult(0, len(test_annotations)))
+    for label, label_result in result.items():
+        assert_metric_eq(label_result, ScalarResult(0, 1))
 
 
 @pytest.mark.parametrize(
@@ -219,56 +149,35 @@ def test_perfect_unmatched_polygon_metrics(
             TEST_ANNOTATION_LIST,
             TEST_PREDICTION_LIST,
             PolygonIOU,
-            ScalarResult(109.0 / 300, 3),
+            {"car": ScalarResult(109.0 / 300, 3)},
             {"enforce_label_match": True},
         ),
-        (
-            TEST_ANNOTATION_LIST,
-            TEST_PREDICTION_LIST,
-            PolygonIOU,
-            ScalarResult(109.0 / 300, 3),
-            {"enforce_label_match": False},
-        ),
         (
             TEST_ANNOTATION_LIST,
             TEST_PREDICTION_LIST,
             PolygonPrecision,
-            ScalarResult(1.0 / 3, 3),
+            {"car": ScalarResult(1.0 / 3, 3)},
             {"enforce_label_match": True},
         ),
-        (
-            TEST_ANNOTATION_LIST,
-            TEST_PREDICTION_LIST,
-            PolygonPrecision,
-            ScalarResult(1.0 / 3, 3),
-            {"enforce_label_match": False},
-        ),
         (
             TEST_ANNOTATION_LIST,
             TEST_PREDICTION_LIST,
             PolygonRecall,
-            ScalarResult(0.5, 2),
+            {"car": ScalarResult(0.5, 2)},
             {"enforce_label_match": True},
         ),
-        (
-            TEST_ANNOTATION_LIST,
-            TEST_PREDICTION_LIST,
-            PolygonRecall,
-            ScalarResult(0.5, 2),
-            {"enforce_label_match": False},
-        ),
         (
             TEST_ANNOTATION_LIST,
             TEST_PREDICTION_LIST,
             PolygonAveragePrecision,
-            ScalarResult(1.0 / 6, 1),
+            {"car": ScalarResult(1.0 / 6, 1)},
             {"label": "car"},
         ),
         (
             TEST_ANNOTATION_LIST,
             TEST_PREDICTION_LIST,
             PolygonMAP,
-            ScalarResult(1.0 / 6, 1),
+            {"car": ScalarResult(1.0 / 6, 1)},
             {},
         ),
     ],
@@ -279,4 +188,6 @@ def test_simple_2_boxes(
     # Test metrics on where annotations = predictions perfectly
     metric = metric_fn(**kwargs)
     result = metric(test_annotations, test_predictions)
-    assert_metric_eq(result, expected)
+    for label, value in result.items():
+        assert label in expected
+        assert_metric_eq(value, expected[label])