Commit bb5563c

remove all_scores
1 parent 302ba77 commit bb5563c

File tree

10 files changed: +35 -50 lines changed

docs/docs/tutorials/agents/index.ipynb

Lines changed: 4 additions & 7 deletions
@@ -500,23 +500,20 @@
 " metric=top5_recall,\n",
 " num_threads=16,\n",
 " display_progress=True,\n",
-" # To record the outputs and detailed scores to MLflow\n",
-" return_all_scores=True,\n",
-" return_outputs=True,\n",
 " )\n",
 "\n",
 " # Evaluate the program as usual\n",
-" aggregated_score, outputs, all_scores = evaluate(cot)\n",
+" result = evaluate(cot)\n",
 "\n",
 " # Log the aggregated score\n",
-" mlflow.log_metric(\"top5_recall\", aggregated_score)\n",
+" mlflow.log_metric(\"top5_recall\", result.score)\n",
 " # Log the detailed evaluation results as a table\n",
 " mlflow.log_table(\n",
 " {\n",
 " \"Claim\": [example.claim for example in eval_set],\n",
 " \"Expected Titles\": [example.titles for example in eval_set],\n",
-" \"Predicted Titles\": outputs,\n",
-" \"Top 5 Recall\": all_scores,\n",
+" \"Predicted Titles\": [output[1] for output in result.outputs],\n",
+" \"Top 5 Recall\": [output[2] for output in result.outputs],\n",
 " },\n",
 " artifact_file=\"eval_results.json\",\n",
 " )\n",

docs/docs/tutorials/classification_finetuning/index.ipynb

Lines changed: 4 additions & 7 deletions
@@ -535,23 +535,20 @@
 " metric=extraction_correctness_metric,\n",
 " num_threads=16,\n",
 " display_progress=True,\n",
-" # To record the outputs and detailed scores to MLflow\n",
-" return_all_scores=True,\n",
-" return_outputs=True,\n",
 " )\n",
 "\n",
 " # Evaluate the program as usual\n",
-" aggregated_score, outputs, all_scores = evaluate_correctness(people_extractor)\n",
+" result = evaluate_correctness(people_extractor)\n",
 "\n",
 " # Log the aggregated score\n",
-" mlflow.log_metric(\"exact_match\", aggregated_score)\n",
+" mlflow.log_metric(\"exact_match\", result.score)\n",
 " # Log the detailed evaluation results as a table\n",
 " mlflow.log_table(\n",
 " {\n",
 " \"Text\": [example.text for example in devset],\n",
 " \"Expected\": [example.example_label for example in devset],\n",
-" \"Predicted\": outputs,\n",
-" \"Exact match\": all_scores,\n",
+" \"Predicted\": [output[1] for output in result.outputs],\n",
+" \"Exact match\": [output[2] for output in result.outputs],\n",
 " },\n",
 " artifact_file=\"eval_results.json\",\n",
 " )\n",

docs/docs/tutorials/entity_extraction/index.ipynb

Lines changed: 4 additions & 7 deletions
@@ -514,23 +514,20 @@
 " metric=extraction_correctness_metric,\n",
 " num_threads=24,\n",
 " display_progress=True,\n",
-" # To record the outputs and detailed scores to MLflow\n",
-" return_all_scores=True,\n",
-" return_outputs=True,\n",
 " )\n",
 "\n",
 " # Evaluate the program as usual\n",
-" aggregated_score, outputs, all_scores = evaluate_correctness(people_extractor)\n",
+" result = evaluate_correctness(people_extractor)\n",
 "\n",
 " # Log the aggregated score\n",
-" mlflow.log_metric(\"exact_match\", aggregated_score)\n",
+" mlflow.log_metric(\"exact_match\", result.score)\n",
 " # Log the detailed evaluation results as a table\n",
 " mlflow.log_table(\n",
 " {\n",
 " \"Tokens\": [example.tokens for example in test_set],\n",
 " \"Expected\": [example.expected_extracted_people for example in test_set],\n",
-" \"Predicted\": outputs,\n",
-" \"Exact match\": all_scores,\n",
+" \"Predicted\": [output[1] for output in result.outputs],\n",
+" \"Exact match\": [output[2] for output in result.outputs],\n",
 " },\n",
 " artifact_file=\"eval_results.json\",\n",
 " )\n",

docs/docs/tutorials/math/index.ipynb

Lines changed: 5 additions & 5 deletions
@@ -369,21 +369,21 @@
 "\n",
 "# Start an MLflow Run to record the evaluation\n",
 "with mlflow.start_run(run_name=\"math_evaluation\"):\n",
-" kwargs = dict(num_threads=THREADS, display_progress=True, return_all_scores=True, return_outputs=True)\n",
+" kwargs = dict(num_threads=THREADS, display_progress=True)\n",
 " evaluate = dspy.Evaluate(devset=dataset.dev, metric=dataset.metric, **kwargs)\n",
 "\n",
 " # Evaluate the program as usual\n",
-" aggregated_score, outputs, all_scores = evaluate(module)\n",
+" result = evaluate(module)\n",
 "\n",
 " # Log the aggregated score\n",
-" mlflow.log_metric(\"correctness\", aggregated_score)\n",
+" mlflow.log_metric(\"correctness\", result.score)\n",
 " # Log the detailed evaluation results as a table\n",
 " mlflow.log_table(\n",
 " {\n",
 " \"Question\": [example.question for example in dataset.dev],\n",
 " \"Gold Answer\": [example.answer for example in dataset.dev],\n",
-" \"Predicted Answer\": outputs,\n",
-" \"Correctness\": all_scores,\n",
+" \"Predicted Answer\": [output[1] for output in result.outputs],\n",
+" \"Correctness\": [output[2] for output in result.outputs],\n",
 " },\n",
 " artifact_file=\"eval_results.json\",\n",
 " )\n",

docs/docs/tutorials/multihop_search/index.ipynb

Lines changed: 4 additions & 7 deletions
@@ -534,23 +534,20 @@
 " metric=top5_recall,\n",
 " num_threads=16,\n",
 " display_progress=True,\n",
-" # To record the outputs and detailed scores to MLflow\n",
-" return_all_scores=True,\n",
-" return_outputs=True,\n",
 " )\n",
 "\n",
 " # Evaluate the program as usual\n",
-" aggregated_score, outputs, all_scores = evaluate(Hop())\n",
+" result = evaluate(Hop())\n",
 "\n",
 " # Log the aggregated score\n",
-" mlflow.log_metric(\"top5_recall\", aggregated_score)\n",
+" mlflow.log_metric(\"top5_recall\", result.score)\n",
 " # Log the detailed evaluation results as a table\n",
 " mlflow.log_table(\n",
 " {\n",
 " \"Claim\": [example.claim for example in eval_set],\n",
 " \"Expected Titles\": [example.titles for example in eval_set],\n",
-" \"Predicted Titles\": outputs,\n",
-" \"Top 5 Recall\": all_scores,\n",
+" \"Predicted Titles\": [output[1] for output in result.outputs],\n",
+" \"Top 5 Recall\": [output[2] for output in result.outputs],\n",
 " },\n",
 " artifact_file=\"eval_results.json\",\n",
 " )\n",

docs/docs/tutorials/rag/index.ipynb

Lines changed: 9 additions & 7 deletions
@@ -731,24 +731,21 @@
 " metric=metric,\n",
 " num_threads=24,\n",
 " display_progress=True,\n",
-" # To record the outputs and detailed scores to MLflow\n",
-" return_all_scores=True,\n",
-" return_outputs=True,\n",
 " )\n",
 "\n",
 " # Evaluate the program as usual\n",
-" aggregated_score, outputs, all_scores = evaluate(cot)\n",
+" result = evaluate(cot)\n",
 "\n",
 "\n",
 " # Log the aggregated score\n",
-" mlflow.log_metric(\"semantic_f1_score\", aggregated_score)\n",
+" mlflow.log_metric(\"semantic_f1_score\", result.score)\n",
 " # Log the detailed evaluation results as a table\n",
 " mlflow.log_table(\n",
 " {\n",
 " \"Question\": [example.question for example in eval_set],\n",
 " \"Gold Response\": [example.response for example in eval_set],\n",
-" \"Predicted Response\": outputs,\n",
-" \"Semantic F1 Score\": all_scores,\n",
+" \"Predicted Response\": [output[1] for output in result.outputs],\n",
+" \"Semantic F1 Score\": [output[2] for output in result.outputs],\n",
 " },\n",
 " artifact_file=\"eval_results.json\",\n",
 " )\n",
@@ -1471,6 +1468,11 @@
 "\n",
 "The first step is to look at your system outputs, which will allow you to identify the sources of lower performance if any. While doing all of this, make sure you continue to refine your metric, e.g. by optimizing against your judgments, and to collect more (or more realistic) data, e.g. from related domains or from putting a demo of your system in front of users."
 ]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": []
 }
 ],
 "metadata": {

dspy/evaluate/evaluate.py

Lines changed: 0 additions & 7 deletions
@@ -110,12 +110,7 @@ def __call__(

             - score: A float percentage score (e.g., 67.30) representing overall performance

-            - all_scores: a list of float scores for each example in devset
-
             - all_outputs: a list of (example, prediction, score) tuples for each example in devset
-
-            - result_table: a pandas DataFrame containing the evaluation results.
-
         """
         metric = metric if metric is not None else self.metric
         devset = devset if devset is not None else self.devset
@@ -167,9 +162,7 @@ def process_item(example):

         return dspy.Prediction(
             score=round(100 * ncorrect / ntotal, 2),
-            all_scores=[score for *_, score in results],
             all_outputs=results,
-            result_table=result_df,
         )
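
With `all_scores` and `result_table` gone, callers recover per-example values from the remaining `all_outputs` field. A minimal caller-side sketch, assuming `evaluate` is a `dspy.Evaluate` instance and `program` a `dspy.Module` defined elsewhere:

result = evaluate(program)

# Aggregate: a float percentage, e.g. 67.30, per the docstring above.
overall = result.score

# Per-example values now come from the (example, prediction, score) tuples;
# this mirrors the list comprehension the commit removed from Evaluate itself.
per_example_scores = [score for *_, score in result.all_outputs]
predictions = [prediction for _, prediction, _ in result.all_outputs]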

dspy/primitives/prediction.py

Lines changed: 3 additions & 1 deletion
@@ -103,7 +103,9 @@ def __eq__(self, other):
             return self.__float__() == other
         elif isinstance(other, Prediction):
             return self.__float__() == float(other)
-        raise TypeError(f"Unsupported type for comparison: {type(other)}")
+        else:
+            # we should return False when Prediction is compared with other types and
+            return False

     @property
     def completions(self):
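
The practical effect of the `__eq__` change: comparing a `Prediction` against an unsupported type now yields `False` instead of raising. A behavior sketch, assuming (as the float-based branches above suggest) that a numeric `score` field backs `__float__`:

import dspy

result = dspy.Prediction(score=80.0)  # illustrative; assumes score backs __float__

result == 80.0      # True: numeric comparison via __float__, unchanged
result == "80.0"    # False: previously raised TypeError, now hits the new else branch
result == {"x": 1}  # False: any other unsupported type behaves the same way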

dspy/teleprompt/random_search.py

Lines changed: 1 addition & 1 deletion
@@ -118,7 +118,7 @@ def compile(self, student, *, teacher=None, trainset, valset=None, restrict=None

         result = evaluate(program)

-        score, subscores = result.score, result.all_scores
+        score, subscores = result.score, [output[2] for output in result.all_outputs]

         all_subscores.append(subscores)

dspy/teleprompt/utils.py

Lines changed: 1 addition & 1 deletion
@@ -59,7 +59,7 @@ def eval_candidate_program(batch_size, trainset, candidate_program, evaluate, rn
     except Exception:
         logger.error("An exception occurred during evaluation", exc_info=True)
         # TODO: Handle this better, as -ve scores are possible
-        return dspy.Prediction(score=0.0, all_scores=[0.0] * len(trainset))
+        return dspy.Prediction(score=0.0, all_outputs=[])


 def eval_candidate_program_with_pruning(
     trial, trial_logs, trainset, candidate_program, evaluate, trial_num, batch_size=100,
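
One consequence of the new fallback: code that derives per-example scores from `all_outputs` (as `random_search.py` now does above) sees an empty list after a failed evaluation, rather than a list of zeros sized to the trainset. A small illustration:

import dspy

# What eval_candidate_program returns when evaluation raises, after this commit.
fallback = dspy.Prediction(score=0.0, all_outputs=[])

# Downstream subscore extraction yields [] instead of [0.0] * len(trainset).
subscores = [output[2] for output in fallback.all_outputs]
assert subscores == []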
