Commit 735e6ec
[Validate] Pass Evaluation Function arguments with EvaluationCriteria (#229)
* Pass eval_func_arguments to backend with EvaluationCriteria
* Add better error message for scenario_test misconfiguration and arguments to all public functions
* Update defaults to match metrics
* Address @phil-scale comments!
* Add examples to configuration functions and clear up class naming
* Fix rebase errors
* Another rebasing error bites the dust
* 🤦‍♂️
1 parent c9f1f59 commit 735e6ec

6 files changed: +238 −39 lines changed
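
Taken together, the diffs below change the Validate flow in one visible way: an eval function is now configured with keyword arguments, and those arguments travel to the backend inside the resulting EvaluationCriterion. A minimal usage sketch, assuming the create_scenario_test signature shown in nucleus/validate/client.py below (API key and slice ID are placeholders):

    import nucleus

    client = nucleus.NucleusClient("YOUR_SCALE_API_KEY")

    # Configuring an eval function now records its keyword arguments; they are
    # forwarded to the backend as EvaluationCriterion.eval_func_arguments.
    bbox_iou_config = client.validate.eval_functions.bbox_iou

    scenario_test = client.validate.create_scenario_test(
        name="Example test",
        slice_id="slc_<your_slice>",  # placeholder slice ID
        evaluation_functions=[bbox_iou_config(confidence_threshold=0.8)],
    )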

nucleus/metrics/categorization_metrics.py

Lines changed: 2 additions & 1 deletion
@@ -143,7 +143,8 @@ def __init__(
     ):
         """
         Args:
-            confidence_threshold: minimum confidence threshold for predictions to be taken into account for evaluation. Must be in [0, 1]. Default 0.0
+            confidence_threshold: minimum confidence threshold for predictions to be taken into account for evaluation.
+                Must be in [0, 1]. Default 0.0
             f1_method: {'micro', 'macro', 'samples','weighted', 'binary'}, \
                 default='macro'
             This parameter is required for multiclass/multilabel targets.
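
For context, the parameter re-documented here is consumed by the metric constructor in this same file. A small sketch, assuming the enclosing class is nucleus.metrics.CategorizationF1:

    from nucleus.metrics import CategorizationF1

    # confidence_threshold must be in [0, 1]; predictions below it are ignored
    # during evaluation. f1_method selects the averaging mode ('macro' default).
    metric = CategorizationF1(confidence_threshold=0.5, f1_method="macro")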

nucleus/validate/client.py

Lines changed: 6 additions & 6 deletions
@@ -7,10 +7,8 @@
 from .data_transfer_objects.eval_function import GetEvalFunctions
 from .data_transfer_objects.scenario_test import CreateScenarioTestRequest
 from .errors import CreateScenarioTestError
-from .eval_functions.available_eval_functions import (
-    AvailableEvalFunctions,
-    EvalFunction,
-)
+from .eval_functions.available_eval_functions import AvailableEvalFunctions
+from .eval_functions.base_eval_function import EvalFunctionConfig
 from .scenario_test import ScenarioTest
 
 SUCCESS_KEY = "success"
@@ -36,7 +34,8 @@ def eval_functions(self) -> AvailableEvalFunctions:
             import nucleus
             client = nucleus.NucleusClient("YOUR_SCALE_API_KEY")
 
-            scenario_test_criterion = client.validate.eval_functions.bbox_iou() > 0.5  # Creates an EvaluationCriterion by comparison
+            # Creates an EvaluationCriterion by using a comparison op
+            scenario_test_criterion = client.validate.eval_functions.bbox_iou() > 0.5
 
         Returns:
             :class:`AvailableEvalFunctions`: A container for all the available eval functions
@@ -51,7 +50,7 @@ def create_scenario_test(
         self,
         name: str,
         slice_id: str,
-        evaluation_functions: List[EvalFunction],
+        evaluation_functions: List[EvalFunctionConfig],
     ) -> ScenarioTest:
         """Creates a new Scenario Test from an existing Nucleus :class:`Slice`:. ::
 
@@ -78,6 +77,7 @@ def create_scenario_test(
                 "Must pass an evaluation_function to the scenario test! I.e. "
                 "evaluation_functions=[client.validate.eval_functions.bbox_iou()]"
             )
+
         response = self.connection.post(
             CreateScenarioTestRequest(
                 name=name,
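
The sharpened error message above fires when create_scenario_test is called without any evaluation functions. A sketch of that failure mode and its fix, assuming the CreateScenarioTestError imported at the top of this file is the exception raised (client as in the earlier sketch):

    from nucleus.validate.errors import CreateScenarioTestError

    try:
        # The misconfiguration the new message targets: an empty list.
        client.validate.create_scenario_test(
            name="Example test",
            slice_id="slc_<your_slice>",
            evaluation_functions=[],
        )
    except CreateScenarioTestError:
        # Fix, per the message: pass at least one configured eval function,
        # e.g. evaluation_functions=[client.validate.eval_functions.bbox_iou()]
        raise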

nucleus/validate/data_transfer_objects/eval_function.py

Lines changed: 3 additions & 1 deletion
@@ -1,4 +1,4 @@
-from typing import List, Optional
+from typing import Any, Dict, List, Optional
 
 from pydantic import validator
 
@@ -50,12 +50,14 @@ class EvaluationCriterion(ImmutableModel):
         eval_function_id (str): ID of evaluation function
         threshold_comparison (:class:`ThresholdComparison`): comparator for evaluation. i.e. threshold=0.5 and threshold_comparator > implies that a test only passes if score > 0.5.
         threshold (float): numerical threshold that together with threshold comparison, defines success criteria for test evaluation.
+        eval_func_arguments: Arguments to pass to the eval function constructor
     """
 
     # TODO: Having only eval_function_id hurts readability -> Add function name
     eval_function_id: str
     threshold_comparison: ThresholdComparison
     threshold: float
+    eval_func_arguments: Dict[str, Any]
 
     @validator("eval_function_id")
     def valid_eval_function_id(cls, v):  # pylint: disable=no-self-argument
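
With the new field in place, an EvaluationCriterion can also be built directly. A sketch based on the fields shown above; the ThresholdComparison import path and member name are assumptions, since neither appears in this diff:

    from nucleus.validate.data_transfer_objects.eval_function import (
        EvaluationCriterion,
        ThresholdComparison,  # assumed to live in this module
    )

    criterion = EvaluationCriterion(
        eval_function_id="ef_<id>",  # placeholder eval function ID
        threshold_comparison=ThresholdComparison.GREATER_THAN,  # assumed member name
        threshold=0.5,
        # New in this commit: constructor arguments for the eval function.
        eval_func_arguments={"confidence_threshold": 0.8},
    )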

nucleus/validate/eval_functions/available_eval_functions.py

Lines changed: 209 additions & 28 deletions
@@ -1,54 +1,234 @@
 import itertools
-from typing import Callable, Dict, List, Type, Union
+from typing import Callable, Dict, List, Optional, Union
 
 from nucleus.logger import logger
-from nucleus.validate.eval_functions.base_eval_function import BaseEvalFunction
+from nucleus.validate.eval_functions.base_eval_function import (
+    EvalFunctionConfig,
+)
 
 from ..data_transfer_objects.eval_function import EvalFunctionEntry
 from ..errors import EvalFunctionNotAvailableError
 
 MEAN_AVG_PRECISION_NAME = "mean_average_precision_boxes"
 
 
-class BoundingBoxIOU(BaseEvalFunction):
+class PolygonIOUConfig(EvalFunctionConfig):
+    def __call__(
+        self,
+        enforce_label_match: bool = False,
+        iou_threshold: float = 0.0,
+        confidence_threshold: float = 0.0,
+        **kwargs,
+    ):
+        """Configures a call to :class:`PolygonIOU` object.
+        ::
+
+            import nucleus
+
+            client = nucleus.NucleusClient(YOUR_SCALE_API_KEY)
+            bbox_iou: BoundingBoxIOU = client.validate.eval_functions.bbox_iou
+            slice_id = "slc_<your_slice>"
+            scenario_test = client.validate.create_scenario_test(
+                "Example test",
+                slice_id=slice_id,
+                evaluation_criteria=[bbox_iou(confidence_threshold=0.8) > 0.5]
+            )
+
+        Args:
+            enforce_label_match: whether to enforce that annotation and prediction labels must match. Defaults to False
+            iou_threshold: IOU threshold to consider detection as valid. Must be in [0, 1]. Default 0.0
+            confidence_threshold: minimum confidence threshold for predictions. Must be in [0, 1]. Default 0.0
+        """
+        return super().__call__(
+            enforce_label_match=enforce_label_match,
+            iou_threshold=iou_threshold,
+            confidence_threshold=confidence_threshold,
+            **kwargs,
+        )
+
     @classmethod
     def expected_name(cls) -> str:
         return "bbox_iou"
 
 
-class BoundingBoxMeanAveragePrecision(BaseEvalFunction):
+class PolygonMAPConfig(EvalFunctionConfig):
+    def __call__(
+        self,
+        iou_threshold: float = 0.5,
+        **kwargs,
+    ):
+        """Configures a call to :class:`PolygonMAP` object.
+        ::
+
+            import nucleus
+
+            client = nucleus.NucleusClient(YOUR_SCALE_API_KEY)
+            bbox_map: BoundingBoxMeanAveragePrecision= client.validate.eval_functions.bbox_map
+            slice_id = "slc_<your_slice>"
+            scenario_test = client.validate.create_scenario_test(
+                "Example test",
+                slice_id=slice_id,
+                evaluation_criteria=[bbox_map(iou_threshold=0.6) > 0.8]
+            )
+
+        Args:
+            iou_threshold: IOU threshold to consider detection as valid. Must be in [0, 1]. Default 0.0
+        """
+        return super().__call__(
+            iou_threshold=iou_threshold,
+            **kwargs,
+        )
+
     @classmethod
     def expected_name(cls) -> str:
         return "bbox_map"
 
 
-class BoundingBoxRecall(BaseEvalFunction):
+class PolygonRecallConfig(EvalFunctionConfig):
+    def __call__(
+        self,
+        enforce_label_match: bool = False,
+        iou_threshold: float = 0.5,
+        confidence_threshold: float = 0.0,
+        **kwargs,
+    ):
+        """Configures a call to :class:`PolygonRecall` object.
+        ::
+
+            import nucleus
+
+            client = nucleus.NucleusClient(YOUR_SCALE_API_KEY)
+            bbox_recall: BoundingBoxMeanAveragePrecision= client.validate.eval_functions.bbox_recall
+            slice_id = "slc_<your_slice>"
+            scenario_test = client.validate.create_scenario_test(
+                "Example test",
+                slice_id=slice_id,
+                evaluation_criteria=[bbox_recall(iou_threshold=0.6, confidence_threshold=0.4) > 0.9]
+            )
+
+        Args:
+            enforce_label_match: whether to enforce that annotation and prediction labels must match. Defaults to False
+            iou_threshold: IOU threshold to consider detection as valid. Must be in [0, 1]. Default 0.0
+            confidence_threshold: minimum confidence threshold for predictions. Must be in [0, 1]. Default 0.0
+        """
+        return super().__call__(
+            enforce_label_match=enforce_label_match,
+            iou_threshold=iou_threshold,
+            confidence_threshold=confidence_threshold,
+            **kwargs,
+        )
+
     @classmethod
     def expected_name(cls) -> str:
         return "bbox_recall"
 
 
-class BoundingBoxPrecision(BaseEvalFunction):
+class PolygonPrecisionConfig(EvalFunctionConfig):
+    def __call__(
+        self,
+        enforce_label_match: bool = False,
+        iou_threshold: float = 0.5,
+        confidence_threshold: float = 0.0,
+        **kwargs,
+    ):
+        """Configures a call to :class:`PolygonPrecision` object.
+        ::
+
+            import nucleus
+
+            client = nucleus.NucleusClient(YOUR_SCALE_API_KEY)
+            bbox_precision: BoundingBoxMeanAveragePrecision= client.validate.eval_functions.bbox_precision
+            slice_id = "slc_<your_slice>"
+            scenario_test = client.validate.create_scenario_test(
+                "Example test",
+                slice_id=slice_id,
+                evaluation_criteria=[bbox_precision(iou_threshold=0.6, confidence_threshold=0.4) > 0.9]
+            )
+
+        Args:
+            enforce_label_match: whether to enforce that annotation and prediction labels must match. Defaults to False
+            iou_threshold: IOU threshold to consider detection as valid. Must be in [0, 1]. Default 0.0
+            confidence_threshold: minimum confidence threshold for predictions. Must be in [0, 1]. Default 0.0
+        """
+        return super().__call__(
+            enforce_label_match=enforce_label_match,
+            iou_threshold=iou_threshold,
+            confidence_threshold=confidence_threshold,
+            **kwargs,
+        )
+
     @classmethod
     def expected_name(cls) -> str:
         return "bbox_precision"
 
 
-class CategorizationF1(BaseEvalFunction):
+class CategorizationF1Config(EvalFunctionConfig):
+    def __call__(
+        self,
+        confidence_threshold: Optional[float] = None,
+        f1_method: Optional[str] = None,
+        **kwargs,
+    ):
+        """Configure an evaluation of :class:`CategorizationF1`.
+        ::
+
+            import nucleus
+
+            client = nucleus.NucleusClient(YOUR_SCALE_API_KEY)
+            cat_f1: CategorizationF1 = client.validate.eval_functions.cat_f1
+            slice_id = "slc_<your_slice>"
+            scenario_test = client.validate.create_scenario_test(
+                "Example test",
+                slice_id=slice_id,
+                evaluation_criteria=[cat_f1(confidence_threshold=0.6, f1_method="weighted") > 0.7]
+            )
+
+        Args:
+            confidence_threshold: minimum confidence threshold for predictions to be taken into account for evaluation.
+                Must be in [0, 1]. Default 0.0
+            f1_method: {'micro', 'macro', 'samples','weighted', 'binary'}, \
+                default='macro'
+                This parameter is required for multiclass/multilabel targets.
+                If ``None``, the scores for each class are returned. Otherwise, this
+                determines the type of averaging performed on the data:
+
+                ``'binary'``:
+                    Only report results for the class specified by ``pos_label``.
+                    This is applicable only if targets (``y_{true,pred}``) are binary.
+                ``'micro'``:
+                    Calculate metrics globally by counting the total true positives,
+                    false negatives and false positives.
+                ``'macro'``:
+                    Calculate metrics for each label, and find their unweighted
+                    mean. This does not take label imbalance into account.
+                ``'weighted'``:
+                    Calculate metrics for each label, and find their average weighted
+                    by support (the number of true instances for each label). This
+                    alters 'macro' to account for label imbalance; it can result in an
+                    F-score that is not between precision and recall.
+                ``'samples'``:
+                    Calculate metrics for each instance, and find their average (only
+                    meaningful for multilabel classification where this differs from
+                    :func:`accuracy_score`).
+        """
+        return super().__call__(
+            confidence_threshold=confidence_threshold, f1_method=f1_method
+        )
+
     @classmethod
     def expected_name(cls) -> str:
         return "cat_f1"
 
 
-class CustomEvalFunction(BaseEvalFunction):
+class CustomEvalFunction(EvalFunctionConfig):
     @classmethod
     def expected_name(cls) -> str:
         raise NotImplementedError(
             "Custm evaluation functions are coming soon"
         )  # Placeholder: See super().eval_func_entry for actual name
 
 
-class StandardEvalFunction(BaseEvalFunction):
+class StandardEvalFunction(EvalFunctionConfig):
     """Class for standard Model CI eval functions that have not been added as attributes on
     AvailableEvalFunctions yet.
     """
@@ -65,7 +245,7 @@ def expected_name(cls) -> str:
         return "public_function"  # Placeholder: See super().eval_func_entry for actual name
 
 
-class EvalFunctionNotAvailable(BaseEvalFunction):
+class EvalFunctionNotAvailable(EvalFunctionConfig):
     def __init__(
         self, not_available_name: str
     ):  # pylint: disable=super-init-not-called
@@ -89,13 +269,14 @@ def expected_name(cls) -> str:
 
 
 EvalFunction = Union[
-    Type[BoundingBoxIOU],
-    Type[BoundingBoxMeanAveragePrecision],
-    Type[BoundingBoxPrecision],
-    Type[BoundingBoxRecall],
-    Type[CustomEvalFunction],
-    Type[EvalFunctionNotAvailable],
-    Type[StandardEvalFunction],
+    PolygonIOUConfig,
+    PolygonMAPConfig,
+    PolygonPrecisionConfig,
+    PolygonRecallConfig,
+    CategorizationF1Config,
+    CustomEvalFunction,
+    EvalFunctionNotAvailable,
+    StandardEvalFunction,
 ]
 
 
@@ -124,24 +305,24 @@ def __init__(self, available_functions: List[EvalFunctionEntry]):
             f.name: f for f in available_functions if f.is_public
         }
         # NOTE: Public are assigned
-        self._public_to_function: Dict[str, BaseEvalFunction] = {}
+        self._public_to_function: Dict[str, EvalFunctionConfig] = {}
         self._custom_to_function: Dict[str, CustomEvalFunction] = {
             f.name: CustomEvalFunction(f)
             for f in available_functions
             if not f.is_public
         }
-        self.bbox_iou = self._assign_eval_function_if_defined(BoundingBoxIOU)  # type: ignore
-        self.bbox_precision = self._assign_eval_function_if_defined(
-            BoundingBoxPrecision  # type: ignore
+        self.bbox_iou: PolygonIOUConfig = self._assign_eval_function_if_defined(PolygonIOUConfig)  # type: ignore
+        self.bbox_precision: PolygonPrecisionConfig = self._assign_eval_function_if_defined(
+            PolygonPrecisionConfig  # type: ignore
         )
-        self.bbox_recall = self._assign_eval_function_if_defined(
-            BoundingBoxRecall  # type: ignore
+        self.bbox_recall: PolygonRecallConfig = self._assign_eval_function_if_defined(
+            PolygonRecallConfig  # type: ignore
         )
-        self.bbox_map = self._assign_eval_function_if_defined(
-            BoundingBoxMeanAveragePrecision  # type: ignore
+        self.bbox_map: PolygonMAPConfig = self._assign_eval_function_if_defined(
+            PolygonMAPConfig  # type: ignore
        )
-        self.cat_f1 = self._assign_eval_function_if_defined(
-            CategorizationF1  # type: ignore
+        self.cat_f1: CategorizationF1Config = self._assign_eval_function_if_defined(
+            CategorizationF1Config  # type: ignore
         )
 
         # Add public entries that have not been implemented as an attribute on this class
@@ -163,7 +344,7 @@ def __repr__(self):
         )
 
     @property
-    def public_functions(self) -> Dict[str, BaseEvalFunction]:
+    def public_functions(self) -> Dict[str, EvalFunctionConfig]:
         """Standard functions provided by Model CI.
 
         Notes:
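
Every renamed class above follows the same contract: expected_name binds the config to a backend eval function, and __call__ captures keyword arguments by delegating to the base class, which later ships them as eval_func_arguments. A sketch of that pattern with a hypothetical subclass (EvalFunctionConfig's internals are not part of this diff, so only the delegation visible above is assumed):

    from nucleus.validate.eval_functions.base_eval_function import (
        EvalFunctionConfig,
    )


    class ExampleIOUConfig(EvalFunctionConfig):  # hypothetical, for illustration
        def __call__(self, iou_threshold: float = 0.5, **kwargs):
            # Captured kwargs end up in EvaluationCriterion.eval_func_arguments.
            return super().__call__(iou_threshold=iou_threshold, **kwargs)

        @classmethod
        def expected_name(cls) -> str:
            # Must match the backend eval function's public name.
            return "example_iou"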
