Commit bb5563c

remove all_scores
1 parent 302ba77 commit bb5563c

File tree

10 files changed: +35 -50 lines changed

docs/docs/tutorials/agents/index.ipynb

Lines changed: 4 additions & 7 deletions
@@ -500,23 +500,20 @@
 " metric=top5_recall,\n",
 " num_threads=16,\n",
 " display_progress=True,\n",
-" # To record the outputs and detailed scores to MLflow\n",
-" return_all_scores=True,\n",
-" return_outputs=True,\n",
 " )\n",
 "\n",
 " # Evaluate the program as usual\n",
-" aggregated_score, outputs, all_scores = evaluate(cot)\n",
+" result = evaluate(cot)\n",
 "\n",
 " # Log the aggregated score\n",
-" mlflow.log_metric(\"top5_recall\", aggregated_score)\n",
+" mlflow.log_metric(\"top5_recall\", result.score)\n",
 " # Log the detailed evaluation results as a table\n",
 " mlflow.log_table(\n",
 " {\n",
 " \"Claim\": [example.claim for example in eval_set],\n",
 " \"Expected Titles\": [example.titles for example in eval_set],\n",
-" \"Predicted Titles\": outputs,\n",
-" \"Top 5 Recall\": all_scores,\n",
+" \"Predicted Titles\": [output[1] for output in result.outputs],\n",
+" \"Top 5 Recall\": [output[2] for output in result.outputs],\n",
 " },\n",
 " artifact_file=\"eval_results.json\",\n",
 " )\n",

docs/docs/tutorials/classification_finetuning/index.ipynb

Lines changed: 4 additions & 7 deletions
@@ -535,23 +535,20 @@
 " metric=extraction_correctness_metric,\n",
 " num_threads=16,\n",
 " display_progress=True,\n",
-" # To record the outputs and detailed scores to MLflow\n",
-" return_all_scores=True,\n",
-" return_outputs=True,\n",
 " )\n",
 "\n",
 " # Evaluate the program as usual\n",
-" aggregated_score, outputs, all_scores = evaluate_correctness(people_extractor)\n",
+" result = evaluate_correctness(people_extractor)\n",
 "\n",
 " # Log the aggregated score\n",
-" mlflow.log_metric(\"exact_match\", aggregated_score)\n",
+" mlflow.log_metric(\"exact_match\", result.score)\n",
 " # Log the detailed evaluation results as a table\n",
 " mlflow.log_table(\n",
 " {\n",
 " \"Text\": [example.text for example in devset],\n",
 " \"Expected\": [example.example_label for example in devset],\n",
-" \"Predicted\": outputs,\n",
-" \"Exact match\": all_scores,\n",
+" \"Predicted\": [output[1] for output in result.outputs],\n",
+" \"Exact match\": [output[2] for output in result.outputs],\n",
 " },\n",
 " artifact_file=\"eval_results.json\",\n",
 " )\n",

docs/docs/tutorials/entity_extraction/index.ipynb

Lines changed: 4 additions & 7 deletions
@@ -514,23 +514,20 @@
 " metric=extraction_correctness_metric,\n",
 " num_threads=24,\n",
 " display_progress=True,\n",
-" # To record the outputs and detailed scores to MLflow\n",
-" return_all_scores=True,\n",
-" return_outputs=True,\n",
 " )\n",
 "\n",
 " # Evaluate the program as usual\n",
-" aggregated_score, outputs, all_scores = evaluate_correctness(people_extractor)\n",
+" result = evaluate_correctness(people_extractor)\n",
 "\n",
 " # Log the aggregated score\n",
-" mlflow.log_metric(\"exact_match\", aggregated_score)\n",
+" mlflow.log_metric(\"exact_match\", result.score)\n",
 " # Log the detailed evaluation results as a table\n",
 " mlflow.log_table(\n",
 " {\n",
 " \"Tokens\": [example.tokens for example in test_set],\n",
 " \"Expected\": [example.expected_extracted_people for example in test_set],\n",
-" \"Predicted\": outputs,\n",
-" \"Exact match\": all_scores,\n",
+" \"Predicted\": [output[1] for output in result.outputs],\n",
+" \"Exact match\": [output[2] for output in result.outputs],\n",
 " },\n",
 " artifact_file=\"eval_results.json\",\n",
 " )\n",

docs/docs/tutorials/math/index.ipynb

Lines changed: 5 additions & 5 deletions
@@ -369,21 +369,21 @@
 "\n",
 "# Start an MLflow Run to record the evaluation\n",
 "with mlflow.start_run(run_name=\"math_evaluation\"):\n",
-" kwargs = dict(num_threads=THREADS, display_progress=True, return_all_scores=True, return_outputs=True)\n",
+" kwargs = dict(num_threads=THREADS, display_progress=True)\n",
 " evaluate = dspy.Evaluate(devset=dataset.dev, metric=dataset.metric, **kwargs)\n",
 "\n",
 " # Evaluate the program as usual\n",
-" aggregated_score, outputs, all_scores = evaluate(module)\n",
+" result = evaluate(module)\n",
 "\n",
 " # Log the aggregated score\n",
-" mlflow.log_metric(\"correctness\", aggregated_score)\n",
+" mlflow.log_metric(\"correctness\", result.score)\n",
 " # Log the detailed evaluation results as a table\n",
 " mlflow.log_table(\n",
 " {\n",
 " \"Question\": [example.question for example in dataset.dev],\n",
 " \"Gold Answer\": [example.answer for example in dataset.dev],\n",
-" \"Predicted Answer\": outputs,\n",
-" \"Correctness\": all_scores,\n",
+" \"Predicted Answer\": [output[1] for output in result.outputs],\n",
+" \"Correctness\": [output[2] for output in result.outputs],\n",
 " },\n",
 " artifact_file=\"eval_results.json\",\n",
 " )\n",

docs/docs/tutorials/multihop_search/index.ipynb

Lines changed: 4 additions & 7 deletions
@@ -534,23 +534,20 @@
 " metric=top5_recall,\n",
 " num_threads=16,\n",
 " display_progress=True,\n",
-" # To record the outputs and detailed scores to MLflow\n",
-" return_all_scores=True,\n",
-" return_outputs=True,\n",
 " )\n",
 "\n",
 " # Evaluate the program as usual\n",
-" aggregated_score, outputs, all_scores = evaluate(Hop())\n",
+" result = evaluate(Hop())\n",
 "\n",
 " # Log the aggregated score\n",
-" mlflow.log_metric(\"top5_recall\", aggregated_score)\n",
+" mlflow.log_metric(\"top5_recall\", result.score)\n",
 " # Log the detailed evaluation results as a table\n",
 " mlflow.log_table(\n",
 " {\n",
 " \"Claim\": [example.claim for example in eval_set],\n",
 " \"Expected Titles\": [example.titles for example in eval_set],\n",
-" \"Predicted Titles\": outputs,\n",
-" \"Top 5 Recall\": all_scores,\n",
+" \"Predicted Titles\": [output[1] for output in result.outputs],\n",
+" \"Top 5 Recall\": [output[2] for output in result.outputs],\n",
 " },\n",
 " artifact_file=\"eval_results.json\",\n",
 " )\n",

docs/docs/tutorials/rag/index.ipynb

Lines changed: 9 additions & 7 deletions
@@ -731,24 +731,21 @@
 " metric=metric,\n",
 " num_threads=24,\n",
 " display_progress=True,\n",
-" # To record the outputs and detailed scores to MLflow\n",
-" return_all_scores=True,\n",
-" return_outputs=True,\n",
 " )\n",
 "\n",
 " # Evaluate the program as usual\n",
-" aggregated_score, outputs, all_scores = evaluate(cot)\n",
+" result = evaluate(cot)\n",
 "\n",
 "\n",
 " # Log the aggregated score\n",
-" mlflow.log_metric(\"semantic_f1_score\", aggregated_score)\n",
+" mlflow.log_metric(\"semantic_f1_score\", result.score)\n",
 " # Log the detailed evaluation results as a table\n",
 " mlflow.log_table(\n",
 " {\n",
 " \"Question\": [example.question for example in eval_set],\n",
 " \"Gold Response\": [example.response for example in eval_set],\n",
-" \"Predicted Response\": outputs,\n",
-" \"Semantic F1 Score\": all_scores,\n",
+" \"Predicted Response\": [output[1] for output in result.outputs],\n",
+" \"Semantic F1 Score\": [output[2] for output in result.outputs],\n",
 " },\n",
 " artifact_file=\"eval_results.json\",\n",
 " )\n",
@@ -1471,6 +1468,11 @@
 "\n",
 "The first step is to look at your system outputs, which will allow you to identify the sources of lower performance if any. While doing all of this, make sure you continue to refine your metric, e.g. by optimizing against your judgments, and to collect more (or more realistic) data, e.g. from related domains or from putting a demo of your system in front of users."
 ]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": []
 }
 ],
 "metadata": {

dspy/evaluate/evaluate.py

Lines changed: 0 additions & 7 deletions
@@ -110,12 +110,7 @@ def __call__(

             - score: A float percentage score (e.g., 67.30) representing overall performance

-            - all_scores: a list of float scores for each example in devset
-
             - all_outputs: a list of (example, prediction, score) tuples for each example in devset
-
-            - result_table: a pandas DataFrame containing the evaluation results.
-
         """
         metric = metric if metric is not None else self.metric
         devset = devset if devset is not None else self.devset
@@ -167,9 +162,7 @@ def process_item(example):

         return dspy.Prediction(
             score=round(100 * ncorrect / ntotal, 2),
-            all_scores=[score for *_, score in results],
             all_outputs=results,
-            result_table=result_df,
         )
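
With `all_scores` and `result_table` gone, callers recover per-example values from the remaining `all_outputs` field. A minimal caller-side sketch, assuming `evaluate` is a `dspy.Evaluate` instance and `program` a `dspy.Module` defined elsewhere:

result = evaluate(program)

# Aggregate: a float percentage, e.g. 67.30, per the docstring above.
overall = result.score

# Per-example values now come from the (example, prediction, score) tuples;
# this mirrors the list comprehension the commit removed from Evaluate itself.
per_example_scores = [score for *_, score in result.all_outputs]
predictions = [prediction for _, prediction, _ in result.all_outputs]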

dspy/primitives/prediction.py

Lines changed: 3 additions & 1 deletion
@@ -103,7 +103,9 @@ def __eq__(self, other):
             return self.__float__() == other
         elif isinstance(other, Prediction):
             return self.__float__() == float(other)
-        raise TypeError(f"Unsupported type for comparison: {type(other)}")
+        else:
+            # we should return False when Prediction is compared with other types and
+            return False

     @property
     def completions(self):
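
The practical effect of the `__eq__` change: comparing a `Prediction` against an unsupported type now yields `False` instead of raising. A behavior sketch, assuming (as the float-based branches above suggest) that a numeric `score` field backs `__float__`:

import dspy

result = dspy.Prediction(score=80.0)  # illustrative; assumes score backs __float__

result == 80.0      # True: numeric comparison via __float__, unchanged
result == "80.0"    # False: previously raised TypeError, now hits the new else branch
result == {"x": 1}  # False: any other unsupported type behaves the same way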

dspy/teleprompt/random_search.py

Lines changed: 1 addition & 1 deletion
@@ -118,7 +118,7 @@ def compile(self, student, *, teacher=None, trainset, valset=None, restrict=None

         result = evaluate(program)

-        score, subscores = result.score, result.all_scores
+        score, subscores = result.score, [output[2] for output in result.all_outputs]

         all_subscores.append(subscores)

dspy/teleprompt/utils.py

Lines changed: 1 addition & 1 deletion
@@ -59,7 +59,7 @@ def eval_candidate_program(batch_size, trainset, candidate_program, evaluate, rn
     except Exception:
         logger.error("An exception occurred during evaluation", exc_info=True)
         # TODO: Handle this better, as -ve scores are possible
-        return dspy.Prediction(score=0.0, all_scores=[0.0] * len(trainset))
+        return dspy.Prediction(score=0.0, all_outputs=[])


 def eval_candidate_program_with_pruning(
     trial, trial_logs, trainset, candidate_program, evaluate, trial_num, batch_size=100,
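
One consequence of the new fallback: code that derives per-example scores from `all_outputs` (as `random_search.py` now does above) sees an empty list after a failed evaluation, rather than a list of zeros sized to the trainset. A small illustration:

import dspy

# What eval_candidate_program returns when evaluation raises, after this commit.
fallback = dspy.Prediction(score=0.0, all_outputs=[])

# Downstream subscore extraction yields [] instead of [0.0] * len(trainset).
subscores = [output[2] for output in fallback.all_outputs]
assert subscores == []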
