From 3c5c8529d20a3f6de64c9b56acf6a14079beddf2 Mon Sep 17 00:00:00 2001 From: Crawford Collins Date: Wed, 27 Aug 2025 21:24:55 -0500 Subject: [PATCH 1/3] adding save_as_csv and save_as_json to evaluate --- dspy/evaluate/evaluate.py | 98 ++++++++++++++++++++++++++------------- 1 file changed, 65 insertions(+), 33 deletions(-) diff --git a/dspy/evaluate/evaluate.py b/dspy/evaluate/evaluate.py index 721b34af3e..7142d20600 100644 --- a/dspy/evaluate/evaluate.py +++ b/dspy/evaluate/evaluate.py @@ -1,7 +1,9 @@ +import csv import importlib +import json import logging import types -from typing import TYPE_CHECKING, Any, Callable +from typing import Any, Callable, Optional, TYPE_CHECKING, Union if TYPE_CHECKING: import pandas as pd @@ -27,6 +29,7 @@ def display(obj: Any): """ print(obj) + def HTML(x: str) -> str: # noqa: N802 """ Obtain the HTML representation of the specified string. @@ -36,7 +39,6 @@ def HTML(x: str) -> str: # noqa: N802 # available, this method will simply return the input string. return x - # TODO: Counting failures and having a max_failure count. When that is exceeded (also just at the end), # we print the number of failures, the first N examples that failed, and the first N exceptions raised. @@ -67,17 +69,19 @@ class Evaluate: """ def __init__( - self, - *, - devset: list["dspy.Example"], - metric: Callable | None = None, - num_threads: int | None = None, - display_progress: bool = False, - display_table: bool | int = False, - max_errors: int | None = None, - provide_traceback: bool | None = None, - failure_score: float = 0.0, - **kwargs, + self, + *, + devset: list["dspy.Example"], + metric: Callable | None = None, + num_threads: int | None = None, + display_progress: bool = False, + display_table: bool | int = False, + max_errors: int | None = None, + provide_traceback: bool | None = None, + failure_score: float = 0.0, + save_as_csv=None, + save_as_json=None, + **kwargs, ): """ Args: @@ -100,20 +104,26 @@ def __init__( self.max_errors = max_errors self.provide_traceback = provide_traceback self.failure_score = failure_score + self.save_as_csv = save_as_csv + self.save_as_json = save_as_json if "return_outputs" in kwargs: - raise ValueError("`return_outputs` is no longer supported. Results are always returned inside the `results` field of the `EvaluationResult` object.") + raise ValueError( + "`return_outputs` is no longer supported. 
Results are always returned inside the `results` field of the `EvaluationResult` object.") @with_callbacks def __call__( - self, - program: "dspy.Module", - metric: Callable | None = None, - devset: list["dspy.Example"] | None = None, - num_threads: int | None = None, - display_progress: bool | None = None, - display_table: bool | int | None = None, - callback_metadata: dict[str, Any] | None = None, + self, + program: "dspy.Module", + metric: Callable | None = None, + devset: list["dspy.Example"] | None = None, + num_threads: int | None = None, + display_progress: bool | None = None, + display_table: bool | int | None = None, + callback_metadata: dict[str, Any] | None = None, + save_as_csv: Optional[str] = None, + save_as_json: Optional[str] = None, + ) -> EvaluationResult: """ Args: @@ -140,6 +150,8 @@ def __call__( num_threads = num_threads if num_threads is not None else self.num_threads display_progress = display_progress if display_progress is not None else self.display_progress display_table = display_table if display_table is not None else self.display_table + save_as_csv = save_as_csv if save_as_csv is not None else self.save_as_csv + save_as_json = save_as_json if save_as_json is not None else self.save_as_json if callback_metadata: logger.debug(f"Evaluate is called with callback metadata: {callback_metadata}") @@ -179,13 +191,41 @@ def process_item(example): else: logger.warning("Skipping table display since `pandas` is not installed.") + if save_as_csv: + metric_name = metric.__name__ if isinstance(metric, types.FunctionType) else metric.__class__.__name__ + data = self._prepare_results_output(results, metric_name) + + with open(save_as_csv, 'w', newline='') as csvfile: + fieldnames = data[0].keys() + writer = csv.DictWriter(csvfile, fieldnames=fieldnames) + + writer.writeheader() + for row in data: + writer.writerow(row) + if save_as_json: + metric_name = metric.__name__ if isinstance(metric, types.FunctionType) else metric.__class__.__name__ + data = self._prepare_results_output(results, metric_name) + with open(save_as_json, 'w', ) as f: + json.dump(data, f) + return EvaluationResult( score=round(100 * ncorrect / ntotal, 2), results=results, ) + @staticmethod + def _prepare_results_output(results: list[tuple[dspy.Example, dspy.Example, Any]], metric_name: str): + return [ + ( + merge_dicts(example, prediction) | {metric_name: score} + if prediction_is_dictlike(prediction) + else dict(example) | {"prediction": prediction, metric_name: score} + ) + for example, prediction, score in results + ] + def _construct_result_table( - self, results: list[tuple["dspy.Example", "dspy.Example", Any]], metric_name: str + self, results: list[tuple["dspy.Example", "dspy.Example", Any]], metric_name: str ) -> "pd.DataFrame": """ Construct a pandas DataFrame from the specified result list. 
@@ -200,14 +240,7 @@ def _construct_result_table( """ import pandas as pd - data = [ - ( - merge_dicts(example, prediction) | {"correct": score} - if prediction_is_dictlike(prediction) - else dict(example) | {"prediction": prediction, "correct": score} - ) - for example, prediction, score in results - ] + data = self._prepare_results_output(results, metric_name) # Truncate every cell in the DataFrame (DataFrame.applymap was renamed to DataFrame.map in Pandas 2.1.0) result_df = pd.DataFrame(data) @@ -308,7 +341,7 @@ def display_dataframe(df: "pd.DataFrame"): else: # Pretty print the DataFrame to the console with pd.option_context( - "display.max_rows", None, "display.max_columns", None + "display.max_rows", None, "display.max_columns", None ): # more options can be specified also print(df) @@ -335,7 +368,6 @@ def is_in_ipython_notebook_environment(): except ImportError: return False - # FIXME: TODO: The merge_dicts stuff above is way too quick and dirty. # TODO: the display_table can't handle False but can handle 0! # Not sure how it works with True exactly, probably fails too. From 5fc2d00858d00bcf02257a57dd1607c88fdccf67 Mon Sep 17 00:00:00 2001 From: Crawford Collins Date: Wed, 27 Aug 2025 21:32:54 -0500 Subject: [PATCH 2/3] reformat --- dspy/evaluate/evaluate.py | 148 +++++++++++++++++++++++++------------- 1 file changed, 100 insertions(+), 48 deletions(-) diff --git a/dspy/evaluate/evaluate.py b/dspy/evaluate/evaluate.py index 7142d20600..66f9c65606 100644 --- a/dspy/evaluate/evaluate.py +++ b/dspy/evaluate/evaluate.py @@ -3,7 +3,7 @@ import json import logging import types -from typing import Any, Callable, Optional, TYPE_CHECKING, Union +from typing import TYPE_CHECKING, Any, Callable if TYPE_CHECKING: import pandas as pd @@ -29,7 +29,6 @@ def display(obj: Any): """ print(obj) - def HTML(x: str) -> str: # noqa: N802 """ Obtain the HTML representation of the specified string. @@ -39,6 +38,7 @@ def HTML(x: str) -> str: # noqa: N802 # available, this method will simply return the input string. return x + # TODO: Counting failures and having a max_failure count. When that is exceeded (also just at the end), # we print the number of failures, the first N examples that failed, and the first N exceptions raised. @@ -54,7 +54,9 @@ class EvaluationResult(Prediction): - results: a list of (example, prediction, score) tuples for each example in devset """ - def __init__(self, score: float, results: list[tuple["dspy.Example", "dspy.Example", Any]]): + def __init__( + self, score: float, results: list[tuple["dspy.Example", "dspy.Example", Any]] + ): super().__init__(score=score, results=results) def __repr__(self): @@ -69,19 +71,19 @@ class Evaluate: """ def __init__( - self, - *, - devset: list["dspy.Example"], - metric: Callable | None = None, - num_threads: int | None = None, - display_progress: bool = False, - display_table: bool | int = False, - max_errors: int | None = None, - provide_traceback: bool | None = None, - failure_score: float = 0.0, - save_as_csv=None, - save_as_json=None, - **kwargs, + self, + *, + devset: list["dspy.Example"], + metric: Callable | None = None, + num_threads: int | None = None, + display_progress: bool = False, + display_table: bool | int = False, + max_errors: int | None = None, + provide_traceback: bool | None = None, + failure_score: float = 0.0, + save_as_csv=None, + save_as_json=None, + **kwargs, ): """ Args: @@ -109,21 +111,21 @@ def __init__( if "return_outputs" in kwargs: raise ValueError( - "`return_outputs` is no longer supported. 
Results are always returned inside the `results` field of the `EvaluationResult` object.") + "`return_outputs` is no longer supported. Results are always returned inside the `results` field of the `EvaluationResult` object." + ) @with_callbacks def __call__( - self, - program: "dspy.Module", - metric: Callable | None = None, - devset: list["dspy.Example"] | None = None, - num_threads: int | None = None, - display_progress: bool | None = None, - display_table: bool | int | None = None, - callback_metadata: dict[str, Any] | None = None, - save_as_csv: Optional[str] = None, - save_as_json: Optional[str] = None, - + self, + program: "dspy.Module", + metric: Callable | None = None, + devset: list["dspy.Example"] | None = None, + num_threads: int | None = None, + display_progress: bool | None = None, + display_table: bool | int | None = None, + callback_metadata: dict[str, Any] | None = None, + save_as_csv: str | None = None, + save_as_json: str | None = None, ) -> EvaluationResult: """ Args: @@ -148,20 +150,30 @@ def __call__( metric = metric if metric is not None else self.metric devset = devset if devset is not None else self.devset num_threads = num_threads if num_threads is not None else self.num_threads - display_progress = display_progress if display_progress is not None else self.display_progress - display_table = display_table if display_table is not None else self.display_table + display_progress = ( + display_progress if display_progress is not None else self.display_progress + ) + display_table = ( + display_table if display_table is not None else self.display_table + ) save_as_csv = save_as_csv if save_as_csv is not None else self.save_as_csv save_as_json = save_as_json if save_as_json is not None else self.save_as_json if callback_metadata: - logger.debug(f"Evaluate is called with callback metadata: {callback_metadata}") + logger.debug( + f"Evaluate is called with callback metadata: {callback_metadata}" + ) tqdm.tqdm._instances.clear() executor = ParallelExecutor( num_threads=num_threads, disable_progress_bar=not display_progress, - max_errors=(self.max_errors if self.max_errors is not None else dspy.settings.max_errors), + max_errors=( + self.max_errors + if self.max_errors is not None + else dspy.settings.max_errors + ), provide_traceback=self.provide_traceback, compare_results=True, ) @@ -174,28 +186,46 @@ def process_item(example): results = executor.execute(process_item, devset) assert len(devset) == len(results) - results = [((dspy.Prediction(), self.failure_score) if r is None else r) for r in results] - results = [(example, prediction, score) for example, (prediction, score) in zip(devset, results, strict=False)] + results = [ + ((dspy.Prediction(), self.failure_score) if r is None else r) + for r in results + ] + results = [ + (example, prediction, score) + for example, (prediction, score) in zip(devset, results, strict=False) + ] ncorrect, ntotal = sum(score for *_, score in results), len(devset) - logger.info(f"Average Metric: {ncorrect} / {ntotal} ({round(100 * ncorrect / ntotal, 1)}%)") + logger.info( + f"Average Metric: {ncorrect} / {ntotal} ({round(100 * ncorrect / ntotal, 1)}%)" + ) if display_table: if importlib.util.find_spec("pandas") is not None: # Rename the 'correct' column to the name of the metric object - metric_name = metric.__name__ if isinstance(metric, types.FunctionType) else metric.__class__.__name__ + metric_name = ( + metric.__name__ + if isinstance(metric, types.FunctionType) + else metric.__class__.__name__ + ) # Construct a pandas DataFrame from 
the results result_df = self._construct_result_table(results, metric_name) self._display_result_table(result_df, display_table, metric_name) else: - logger.warning("Skipping table display since `pandas` is not installed.") + logger.warning( + "Skipping table display since `pandas` is not installed." + ) if save_as_csv: - metric_name = metric.__name__ if isinstance(metric, types.FunctionType) else metric.__class__.__name__ + metric_name = ( + metric.__name__ + if isinstance(metric, types.FunctionType) + else metric.__class__.__name__ + ) data = self._prepare_results_output(results, metric_name) - with open(save_as_csv, 'w', newline='') as csvfile: + with open(save_as_csv, "w", newline="") as csvfile: fieldnames = data[0].keys() writer = csv.DictWriter(csvfile, fieldnames=fieldnames) @@ -203,9 +233,16 @@ def process_item(example): for row in data: writer.writerow(row) if save_as_json: - metric_name = metric.__name__ if isinstance(metric, types.FunctionType) else metric.__class__.__name__ + metric_name = ( + metric.__name__ + if isinstance(metric, types.FunctionType) + else metric.__class__.__name__ + ) data = self._prepare_results_output(results, metric_name) - with open(save_as_json, 'w', ) as f: + with open( + save_as_json, + "w", + ) as f: json.dump(data, f) return EvaluationResult( @@ -214,7 +251,9 @@ def process_item(example): ) @staticmethod - def _prepare_results_output(results: list[tuple[dspy.Example, dspy.Example, Any]], metric_name: str): + def _prepare_results_output( + results: list[tuple["dspy.Example", "dspy.Example", Any]], metric_name: str + ): return [ ( merge_dicts(example, prediction) | {metric_name: score} @@ -225,7 +264,9 @@ def _prepare_results_output(results: list[tuple[dspy.Example, dspy.Example, Any] ] def _construct_result_table( - self, results: list[tuple["dspy.Example", "dspy.Example", Any]], metric_name: str + self, + results: list[tuple["dspy.Example", "dspy.Example", Any]], + metric_name: str, ) -> "pd.DataFrame": """ Construct a pandas DataFrame from the specified result list. @@ -244,11 +285,17 @@ def _construct_result_table( # Truncate every cell in the DataFrame (DataFrame.applymap was renamed to DataFrame.map in Pandas 2.1.0) result_df = pd.DataFrame(data) - result_df = result_df.map(truncate_cell) if hasattr(result_df, "map") else result_df.applymap(truncate_cell) + result_df = ( + result_df.map(truncate_cell) + if hasattr(result_df, "map") + else result_df.applymap(truncate_cell) + ) return result_df.rename(columns={"correct": metric_name}) - def _display_result_table(self, result_df: "pd.DataFrame", display_table: bool | int, metric_name: str): + def _display_result_table( + self, result_df: "pd.DataFrame", display_table: bool | int, metric_name: str + ): """ Display the specified result DataFrame in a table format. @@ -323,7 +370,9 @@ def stylize_metric_name(df: "pd.DataFrame", metric_name: str) -> "pd.DataFrame": :param metric_name: The name of the metric for which to stylize DataFrame cell contents. 
""" df[metric_name] = df[metric_name].apply( - lambda x: f"✔️ [{x:.3f}]" if x and isinstance(x, float) else f"✔️ [{x}]" if x else "" + lambda x: ( + f"✔️ [{x:.3f}]" if x and isinstance(x, float) else f"✔️ [{x}]" if x else "" + ) ) return df @@ -341,12 +390,14 @@ def display_dataframe(df: "pd.DataFrame"): else: # Pretty print the DataFrame to the console with pd.option_context( - "display.max_rows", None, "display.max_columns", None + "display.max_rows", None, "display.max_columns", None ): # more options can be specified also print(df) -def configure_dataframe_for_ipython_notebook_display(df: "pd.DataFrame") -> "pd.DataFrame": +def configure_dataframe_for_ipython_notebook_display( + df: "pd.DataFrame", +) -> "pd.DataFrame": """Set various pandas display options for DataFrame in an IPython notebook environment.""" import pandas as pd @@ -368,6 +419,7 @@ def is_in_ipython_notebook_environment(): except ImportError: return False + # FIXME: TODO: The merge_dicts stuff above is way too quick and dirty. # TODO: the display_table can't handle False but can handle 0! # Not sure how it works with True exactly, probably fails too. From 0ed15aeac3392b07e85aea05a4238450b0d2b38a Mon Sep 17 00:00:00 2001 From: Crawford Collins Date: Wed, 10 Sep 2025 17:20:33 -0500 Subject: [PATCH 3/3] more whitespace fixes --- .../self-contained-example/README.md | 44 +++ .../self-contained-example/vocab_agent.py | 300 ++++++++++++++++++ .../self-contained-example/vocab_examples.csv | 155 +++++++++ 3 files changed, 499 insertions(+) create mode 100644 docs/docs/tutorials/self-contained-example/README.md create mode 100644 docs/docs/tutorials/self-contained-example/vocab_agent.py create mode 100644 docs/docs/tutorials/self-contained-example/vocab_examples.csv diff --git a/docs/docs/tutorials/self-contained-example/README.md b/docs/docs/tutorials/self-contained-example/README.md new file mode 100644 index 0000000000..59d9121767 --- /dev/null +++ b/docs/docs/tutorials/self-contained-example/README.md @@ -0,0 +1,44 @@ +# DSPy Simple Optimizer Tutorial + +This tutorial demonstrates DSPy optimization techniques using a French-English vocabulary translation task. The script +includes 155 translation examples and supports three different optimizers: BootstrapFewShot, COPRO, and MIPROv2. + +## Requirements + +## Setup + +The script is configured to use LM Studio with a local model. Ensure LM Studio is running on `http://localhost:1234` +with a loaded model. If you want to use your a different model, change it in the dspy configuration. + +```bash +pip install dspy +``` + +This script uses the unsloth version of Google Gemma 1 (gemma-3-1b-it-GGUF/gemma-3-1b-it-Q8_0.gguf). Which you will +likely need to download with the lm studio tool. + +## Usage + +### Basic Evaluation + +```bash +python vocab_agent.py +``` + +### Optimization + +```bash +# BootstrapFewShot (recommended) +python vocab_agent.py --optimize bootstrap + +# COPRO +python vocab_agent.py --optimize copro + +# MIPROv2 +python vocab_agent.py --optimize miprov2 +``` + +## Dataset + +The `vocab_examples.csv` contains 155 French-English translation pairs with correct/incorrect labels for training and +evaluation. 
\ No newline at end of file
diff --git a/docs/docs/tutorials/self-contained-example/vocab_agent.py b/docs/docs/tutorials/self-contained-example/vocab_agent.py
new file mode 100644
index 0000000000..a313f63e07
--- /dev/null
+++ b/docs/docs/tutorials/self-contained-example/vocab_agent.py
@@ -0,0 +1,300 @@
+import csv
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional
+
+import dspy
+from dspy import BootstrapFewShot, COPRO, Evaluate, MIPROv2
+
+
+# Data models for structured output
+@dataclass
+class VocabTranslationResult:
+    """Result of vocabulary translation evaluation."""
+
+    is_correct: int
+    correct_translation: str
+    explanation: str
+    target_word: str
+
+    def to_dict(self) -> Dict[str, Any]:
+        return {
+            "is_correct": self.is_correct,
+            "correct_translation": self.correct_translation,
+            "explanation": self.explanation,
+            "target_word": self.target_word,
+        }
+
+
+# Enhanced signature with more detailed instructions
+class DetailedVocabularyTranslation(dspy.Signature):
+    """
+    Evaluate vocabulary translation as an expert French tutor.
+    Consider common synonyms and variations.
+    Don't mark wrong if the user didn't provide every possible translation.
+    If the word seems non-standard or rare, mark as correct.
+    """
+
+    source_word: str = dspy.InputField(desc="Word to translate from source language")
+    target_word: str = dspy.InputField(desc="User's translation attempt")
+    source_lang: str = dspy.InputField(desc="Source language")
+    target_lang: str = dspy.InputField(desc="Target language")
+
+    is_correct: int = dspy.OutputField(desc="1 for correct, 0 for incorrect")
+    correct_translation: str = dspy.OutputField(desc="Best translation of the source word")
+    explanation: str = dspy.OutputField(desc="Why the translation is correct/incorrect")
+
+
+class VocabularyTranslationAgent(dspy.Module):
+    """DSPy module for evaluating vocabulary translations with structured output."""
+
+    def __init__(self):
+        super().__init__()
+
+        # Wrap the signature in a Predict module so calling self.evaluate runs the LM
+        self.evaluate = dspy.Predict(DetailedVocabularyTranslation)
+
+    def forward(self, word: str, translation: str, from_french_to_english: bool = True) -> VocabTranslationResult:
+        """
+        Evaluate if a vocabulary translation is correct.
+ + Args: + word: The word to translate + translation: The user's translation + from_french_to_english: Direction of translation + + Returns: + VocabTranslationResult with evaluation details + """ + # Determine source and target languages + if from_french_to_english: + source_lang = "French" + target_lang = "English" + source_word = word + target_word = translation + else: + source_lang = "English" + target_lang = "French" + source_word = translation + target_word = word + + # Get structured response directly + result = self.evaluate( + source_word=source_word, target_word=target_word, source_lang=source_lang, target_lang=target_lang + ) + + return VocabTranslationResult( + is_correct=result.is_correct, + correct_translation=result.correct_translation, + explanation=result.explanation, + target_word=target_word, + ) + + +class EnhancedVocabularyAgent(dspy.Module): + """Enhanced vocabulary translation agent with chain of thought reasoning.""" + + def __init__(self, model: Optional[str] = None): + super().__init__() + + # Use ChainOfThought for better reasoning + self.evaluate = dspy.ChainOfThought(DetailedVocabularyTranslation) + + if model: + self.lm = dspy.LM(model=model) + dspy.configure(lm=self.lm) + + def forward(self, word: str, translation: str, from_french_to_english: bool = True) -> VocabTranslationResult: + """Evaluate with enhanced reasoning.""" + # Prepare languages and words + if from_french_to_english: + source_lang, target_lang = "French", "English" + source_word, target_word = word, translation + else: + source_lang, target_lang = "English", "French" + source_word, target_word = translation, word + + # Evaluate with reasoning + result = self.evaluate( + source_word=source_word, target_word=target_word, source_lang=source_lang, target_lang=target_lang + ) + + return VocabTranslationResult( + is_correct=result.is_correct, + correct_translation=result.correct_translation, + explanation=result.explanation, + target_word=target_word, + ) + + +def load_examples_from_csv(csv_path: str = "vocab_examples.csv") -> List[dspy.Example]: + """Load vocabulary examples from CSV file.""" + examples = [] + + with open(csv_path, 'r', encoding='utf-8') as file: + reader = csv.DictReader(file) + for row in reader: + example = dspy.Example( + source_word=row['source_word'], + target_word=row['target_word'], + source_lang=row['source_lang'], + target_lang=row['target_lang'], + expected=int(row['expected']) + ).with_inputs("source_word", "target_word", "source_lang", "target_lang") + examples.append(example) + + return examples + + +def score_translation(example, prediction, trace=None, pred_name=None, pred_trace=None) -> float: + """Score a translation result against expected outcome.""" + # Handle different calling signatures from GEPA vs regular evaluation + if hasattr(prediction, 'is_correct'): + return example["expected"] == prediction.is_correct + else: + # For GEPA, prediction might be the raw output + return 0.0 # Default fallback + + +def split_dataset(examples: List[dspy.Example], train_ratio: float = 0.8) -> tuple[ + List[dspy.Example], List[dspy.Example]]: + """Split dataset into training and validation sets.""" + import random + random.seed(42) # For reproducible splits + + shuffled = examples.copy() + random.shuffle(shuffled) + + split_idx = int(len(shuffled) * train_ratio) + train_set = shuffled[:split_idx] + val_set = shuffled[split_idx:] + + return train_set, val_set + + +def run_optimization(examples: List[dspy.Example], optimizer_type: str = "bootstrap") -> dspy.Module: + """Run 
DSPy optimization with different optimizers."""
+
+    # Split dataset
+    train_set, val_set = split_dataset(examples)
+    print(f"Training set: {len(train_set)} examples")
+    print(f"Validation set: {len(val_set)} examples")
+
+    # Create student program (the one to optimize)
+    student = dspy.ChainOfThought(DetailedVocabularyTranslation)
+
+    # Setup optimizer based on type
+    if optimizer_type == "bootstrap":
+        print("Using BootstrapFewShot optimizer...")
+        optimizer = BootstrapFewShot(
+            metric=score_translation,
+            max_bootstrapped_demos=8,  # Number of examples to bootstrap
+            max_labeled_demos=4,  # Max labeled demonstrations
+        )
+    elif optimizer_type == "copro":
+        print("Using COPRO optimizer...")
+        optimizer = COPRO(
+            metric=score_translation,
+            breadth=10,
+            depth=3,
+            init_temperature=1.0
+        )
+    elif optimizer_type == "miprov2":
+        print("Using MIPROv2 optimizer...")
+        optimizer = MIPROv2(
+            metric=score_translation,
+            auto="light"  # Use light auto-configuration
+        )
+    else:
+        raise ValueError(f"Unknown optimizer type: {optimizer_type}")
+
+    print(f"Starting {optimizer_type} optimization...")
+
+    # Run optimization
+    if optimizer_type == "bootstrap":
+        optimized_program = optimizer.compile(
+            student=student,
+            trainset=train_set  # BootstrapFewShot doesn't use valset
+        )
+    elif optimizer_type == "copro":
+        optimized_program = optimizer.compile(
+            student=student,
+            trainset=train_set,  # COPRO doesn't use valset
+            eval_kwargs={}  # Required parameter for COPRO
+        )
+    else:
+        optimized_program = optimizer.compile(
+            student=student,
+            trainset=train_set,
+            valset=val_set  # MIPROv2 evaluates candidates on the validation set
+        )
+
+    print(f"{optimizer_type} optimization completed!")
+
+    # Evaluate on full validation set
+    evaluator = Evaluate(devset=val_set, num_threads=1, display_progress=True)
+
+    print("\nEvaluating original program:")
+    original_result = evaluator(student, metric=score_translation)
+    original_score = original_result if isinstance(original_result, (int, float)) else original_result.score
+
+    print("\nEvaluating optimized program:")
+    optimized_result = evaluator(optimized_program, metric=score_translation)
+    optimized_score = optimized_result if isinstance(optimized_result, (int, float)) else optimized_result.score
+
+    print(f"\nOptimization Results ({optimizer_type}):")
+    # Evaluate returns scores as 0-100 percentages, so format them directly
+    print(f"Original accuracy: {original_score:.1f}%")
+    print(f"Optimized accuracy: {optimized_score:.1f}%")
+    print(f"Improvement: {optimized_score - original_score:.1f} percentage points")
+
+    return optimized_program
+
+
+# Example usage and testing
+if __name__ == "__main__":
+    # Setup Gemma model through LM Studio using standard DSPy LM
+    import litellm
+
+    litellm.set_verbose = False
+
+    lm = dspy.LM(
+        model="openai/gemma-3-1b-it",
+        api_base="http://localhost:1234/v1",
+        api_key="dummy",
+        temperature=1.0,
+        top_p=0.95,
+        min_p=0.0,
+        frequency_penalty=1,
+        # max_tokens=512
+    )
+    dspy.configure(lm=lm)
+    # Load examples from CSV
+    examples = load_examples_from_csv()
+    print(f"Loaded {len(examples)} examples from CSV")
+
+    # Choose mode: basic evaluation or optimizer-driven compilation
+    import sys
+
+    if len(sys.argv) > 1 and sys.argv[1].startswith("--optimize"):
+        # Parse optimizer type
+        optimizer_type = "bootstrap"  # default
+        if len(sys.argv) > 2:
+            optimizer_type = sys.argv[2]
+        elif "=" in sys.argv[1]:
+            optimizer_type = sys.argv[1].split("=")[1]
+
+        # Run optimization
+        optimized_program = run_optimization(examples, optimizer_type)
+
+        # Save optimized program (optional)
+        # optimized_program.save(f'optimized_vocab_agent_{optimizer_type}.json')
+
+    else:
+        # Basic evaluation
+        agent = dspy.Predict(DetailedVocabularyTranslation)
+        evaluator = Evaluate(devset=examples, num_threads=1, display_progress=True, display_table=True)
+        score = evaluator(agent, metric=score_translation).score
+        # score is already a 0-100 percentage, so format it directly
+        print(f"\nBaseline accuracy: {score:.1f}%")
+        print("\nOptimization options:")
+        print("  python vocab_agent.py --optimize bootstrap  # BootstrapFewShot (recommended)")
+        print("  python vocab_agent.py --optimize copro      # COPRO")
+        print("  python vocab_agent.py --optimize miprov2    # MIPROv2")
diff --git a/docs/docs/tutorials/self-contained-example/vocab_examples.csv b/docs/docs/tutorials/self-contained-example/vocab_examples.csv
new file mode 100644
index 0000000000..b5f87a42c3
--- /dev/null
+++ b/docs/docs/tutorials/self-contained-example/vocab_examples.csv
@@ -0,0 +1,155 @@
+source_word,target_word,source_lang,target_lang,expected
+chat,cat,French,English,1
+chien,cat,French,English,0
+book,livre,French,English,1
+manger,to sleep,French,English,0
+regarder,to watch,French,English,1
+apporterai,I have brought.,French,English,0
+cerceaux,skip,French,English,0
+déchirasses,slink,French,English,0
+chien,dog,French,English,1
+livre,book,French,English,1
+eau,water,French,English,1
+rouge,red,French,English,1
+manger,to eat,French,English,1
+boire,to drink,French,English,1
+dormir,to sleep,French,English,1
+courir,to run,French,English,1
+maison,house,French,English,1
+voiture,car,French,English,1
+école,school,French,English,1
+ami,friend,French,English,1
+temps,time,French,English,1
+jour,day,French,English,1
+nuit,night,French,English,1
+soleil,sun,French,English,1
+lune,moon,French,English,1
+chat,dog,French,English,0
+livre,water,French,English,0
+rouge,green,French,English,0
+manger,to run,French,English,0
+boire,to sleep,French,English,0
+dormir,to eat,French,English,0
+maison,car,French,English,0
+voiture,house,French,English,0
+école,friend,French,English,0
+temps,day,French,English,0
+manière,way,French,English,1
+ces,these,French,English,1
+pour,for,French,English,1
+voyait,saw,French,English,1
+comment,how,French,English,1
+dessus,above,French,English,1
+où,where,French,English,1
+église,church,French,English,1
+louis,louis,French,English,1
+fond,bottom,French,English,1
+soldats,soldiers,French,English,1
+an,year,French,English,1
+grand,big,French,English,1
+une,a,French,English,1
+reprit,resumed,French,English,1
+toute,all,French,English,1
+années,years,French,English,1
+elle-même,herself,French,English,1
+tard,late,French,English,1
+ils,they,French,English,1
+es,are,French,English,1
+trouvait,found,French,English,1
+seul,alone,French,English,1
+vint,came,French,English,1
+mais,but,French,English,1
+donne,gives,French,English,1
+jour,day,French,English,1
+mes,my,French,English,1
+toutes,all,French,English,1
+vérité,truth,French,English,1
+furent,were,French,English,1
+plus,more,French,English,1
+auteur,author,French,English,1
+front,forehead,French,English,1
+œil,eye,French,English,1
+français,french,French,English,1
+de,of,French,English,1
+regard,look,French,English,1
+ainsi,thus,French,English,1
+rire,laugh,French,English,1
+note,note,French,English,1
+sorte,sort,French,English,1
+première,first,French,English,1
+triste,sad,French,English,1
+autre,other,French,English,1
+chef,chief,French,English,1
+mis,put,French,English,1
+quant,as for,French,English,1
+vient,comes,French,English,1
+rendre,render,French,English,1
+manière,house,French,English,0
+ces,those,French,English,0
+pour,against,French,English,0 +voyait,heard,French,English,0 +comment,why,French,English,0 +dessus,below,French,English,0 +où,when,French,English,0 +église,hospital,French,English,0 +louis,pierre,French,English,0 +fond,top,French,English,0 +soldats,civilians,French,English,0 +an,month,French,English,0 +grand,small,French,English,0 +une,the,French,English,0 +reprit,started,French,English,0 +toute,some,French,English,0 +années,days,French,English,0 +elle-même,himself,French,English,0 +tard,early,French,English,0 +ils,we,French,English,0 +es,is,French,English,0 +trouvait,lost,French,English,0 +seul,together,French,English,0 +vint,went,French,English,0 +mais,and,French,English,0 +donne,takes,French,English,0 +mes,your,French,English,0 +toutes,some,French,English,0 +vérité,lie,French,English,0 +furent,are,French,English,0 +plus,less,French,English,0 +auteur,reader,French,English,0 +front,back,French,English,0 +œil,ear,French,English,0 +français,english,French,English,0 +de,to,French,English,0 +regard,sound,French,English,0 +ainsi,never,French,English,0 +rire,cry,French,English,0 +note,letter,French,English,0 +sorte,type,French,English,0 +première,last,French,English,0 +triste,happy,French,English,0 +autre,same,French,English,0 +chef,worker,French,English,0 +mis,took,French,English,0 +quant,because,French,English,0 +vient,goes,French,English,0 +rendre,take,French,English,0 +longtemps,long time,French,English,1 +longue,long,French,English,1 +aller,to go,French,English,1 +tour,turn,French,English,1 +tes,your,French,English,1 +main,hand,French,English,1 +femmes,women,French,English,1 +si,if,French,English,1 +faisait,was doing,French,English,1 +oncle,uncle,French,English,1 +longtemps,short time,French,English,0 +longue,short,French,English,0 +aller,to stay,French,English,0 +tour,square,French,English,0 +tes,my,French,English,0 +main,foot,French,English,0 +femmes,men,French,English,0 +si,when,French,English,0 +faisait,was sleeping,French,English,0 +oncle,aunt,French,English,0 \ No newline at end of file
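
For reference, the records written by `save_as_csv` / `save_as_json` come from `_prepare_results_output` (patch 1): each `(example, prediction, score)` triple is flattened into one dict merging the example's fields with the prediction's output fields, plus the score stored under the metric's name. A minimal sketch of reading back the hypothetical `vocab_results.csv` from the README example above:

```python
import csv

# Each row merges the example fields (source_word, target_word, source_lang,
# target_lang, expected) with the prediction's outputs (is_correct,
# correct_translation, explanation) plus a "score_translation" score column.
with open("vocab_results.csv", newline="", encoding="utf-8") as f:
    rows = list(csv.DictReader(f))

print(sorted(rows[0].keys()))
print(f"{len(rows)} evaluated examples")
```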