From 3c5c8529d20a3f6de64c9b56acf6a14079beddf2 Mon Sep 17 00:00:00 2001 From: Crawford Collins Date: Wed, 27 Aug 2025 21:24:55 -0500 Subject: [PATCH 1/3] adding save_as_csv and save_as_json to evaluate --- dspy/evaluate/evaluate.py | 98 ++++++++++++++++++++++++++------------- 1 file changed, 65 insertions(+), 33 deletions(-) diff --git a/dspy/evaluate/evaluate.py b/dspy/evaluate/evaluate.py index 721b34af3e..7142d20600 100644 --- a/dspy/evaluate/evaluate.py +++ b/dspy/evaluate/evaluate.py @@ -1,7 +1,9 @@ +import csv import importlib +import json import logging import types -from typing import TYPE_CHECKING, Any, Callable +from typing import Any, Callable, Optional, TYPE_CHECKING, Union if TYPE_CHECKING: import pandas as pd @@ -27,6 +29,7 @@ def display(obj: Any): """ print(obj) + def HTML(x: str) -> str: # noqa: N802 """ Obtain the HTML representation of the specified string. @@ -36,7 +39,6 @@ def HTML(x: str) -> str: # noqa: N802 # available, this method will simply return the input string. return x - # TODO: Counting failures and having a max_failure count. When that is exceeded (also just at the end), # we print the number of failures, the first N examples that failed, and the first N exceptions raised. @@ -67,17 +69,19 @@ class Evaluate: """ def __init__( - self, - *, - devset: list["dspy.Example"], - metric: Callable | None = None, - num_threads: int | None = None, - display_progress: bool = False, - display_table: bool | int = False, - max_errors: int | None = None, - provide_traceback: bool | None = None, - failure_score: float = 0.0, - **kwargs, + self, + *, + devset: list["dspy.Example"], + metric: Callable | None = None, + num_threads: int | None = None, + display_progress: bool = False, + display_table: bool | int = False, + max_errors: int | None = None, + provide_traceback: bool | None = None, + failure_score: float = 0.0, + save_as_csv=None, + save_as_json=None, + **kwargs, ): """ Args: @@ -100,20 +104,26 @@ def __init__( self.max_errors = max_errors self.provide_traceback = provide_traceback self.failure_score = failure_score + self.save_as_csv = save_as_csv + self.save_as_json = save_as_json if "return_outputs" in kwargs: - raise ValueError("`return_outputs` is no longer supported. Results are always returned inside the `results` field of the `EvaluationResult` object.") + raise ValueError( + "`return_outputs` is no longer supported. 
Results are always returned inside the `results` field of the `EvaluationResult` object.") @with_callbacks def __call__( - self, - program: "dspy.Module", - metric: Callable | None = None, - devset: list["dspy.Example"] | None = None, - num_threads: int | None = None, - display_progress: bool | None = None, - display_table: bool | int | None = None, - callback_metadata: dict[str, Any] | None = None, + self, + program: "dspy.Module", + metric: Callable | None = None, + devset: list["dspy.Example"] | None = None, + num_threads: int | None = None, + display_progress: bool | None = None, + display_table: bool | int | None = None, + callback_metadata: dict[str, Any] | None = None, + save_as_csv: Optional[str] = None, + save_as_json: Optional[str] = None, + ) -> EvaluationResult: """ Args: @@ -140,6 +150,8 @@ def __call__( num_threads = num_threads if num_threads is not None else self.num_threads display_progress = display_progress if display_progress is not None else self.display_progress display_table = display_table if display_table is not None else self.display_table + save_as_csv = save_as_csv if save_as_csv is not None else self.save_as_csv + save_as_json = save_as_json if save_as_json is not None else self.save_as_json if callback_metadata: logger.debug(f"Evaluate is called with callback metadata: {callback_metadata}") @@ -179,13 +191,41 @@ def process_item(example): else: logger.warning("Skipping table display since `pandas` is not installed.") + if save_as_csv: + metric_name = metric.__name__ if isinstance(metric, types.FunctionType) else metric.__class__.__name__ + data = self._prepare_results_output(results, metric_name) + + with open(save_as_csv, 'w', newline='') as csvfile: + fieldnames = data[0].keys() + writer = csv.DictWriter(csvfile, fieldnames=fieldnames) + + writer.writeheader() + for row in data: + writer.writerow(row) + if save_as_json: + metric_name = metric.__name__ if isinstance(metric, types.FunctionType) else metric.__class__.__name__ + data = self._prepare_results_output(results, metric_name) + with open(save_as_json, 'w', ) as f: + json.dump(data, f) + return EvaluationResult( score=round(100 * ncorrect / ntotal, 2), results=results, ) + @staticmethod + def _prepare_results_output(results: list[tuple[dspy.Example, dspy.Example, Any]], metric_name: str): + return [ + ( + merge_dicts(example, prediction) | {metric_name: score} + if prediction_is_dictlike(prediction) + else dict(example) | {"prediction": prediction, metric_name: score} + ) + for example, prediction, score in results + ] + def _construct_result_table( - self, results: list[tuple["dspy.Example", "dspy.Example", Any]], metric_name: str + self, results: list[tuple["dspy.Example", "dspy.Example", Any]], metric_name: str ) -> "pd.DataFrame": """ Construct a pandas DataFrame from the specified result list. 
@@ -200,14 +240,7 @@ def _construct_result_table( """ import pandas as pd - data = [ - ( - merge_dicts(example, prediction) | {"correct": score} - if prediction_is_dictlike(prediction) - else dict(example) | {"prediction": prediction, "correct": score} - ) - for example, prediction, score in results - ] + data = self._prepare_results_output(results, metric_name) # Truncate every cell in the DataFrame (DataFrame.applymap was renamed to DataFrame.map in Pandas 2.1.0) result_df = pd.DataFrame(data) @@ -308,7 +341,7 @@ def display_dataframe(df: "pd.DataFrame"): else: # Pretty print the DataFrame to the console with pd.option_context( - "display.max_rows", None, "display.max_columns", None + "display.max_rows", None, "display.max_columns", None ): # more options can be specified also print(df) @@ -335,7 +368,6 @@ def is_in_ipython_notebook_environment(): except ImportError: return False - # FIXME: TODO: The merge_dicts stuff above is way too quick and dirty. # TODO: the display_table can't handle False but can handle 0! # Not sure how it works with True exactly, probably fails too. From 5fc2d00858d00bcf02257a57dd1607c88fdccf67 Mon Sep 17 00:00:00 2001 From: Crawford Collins Date: Wed, 27 Aug 2025 21:32:54 -0500 Subject: [PATCH 2/3] reformat --- dspy/evaluate/evaluate.py | 148 +++++++++++++++++++++++++------------- 1 file changed, 100 insertions(+), 48 deletions(-) diff --git a/dspy/evaluate/evaluate.py b/dspy/evaluate/evaluate.py index 7142d20600..66f9c65606 100644 --- a/dspy/evaluate/evaluate.py +++ b/dspy/evaluate/evaluate.py @@ -3,7 +3,7 @@ import json import logging import types -from typing import Any, Callable, Optional, TYPE_CHECKING, Union +from typing import TYPE_CHECKING, Any, Callable if TYPE_CHECKING: import pandas as pd @@ -29,7 +29,6 @@ def display(obj: Any): """ print(obj) - def HTML(x: str) -> str: # noqa: N802 """ Obtain the HTML representation of the specified string. @@ -39,6 +38,7 @@ def HTML(x: str) -> str: # noqa: N802 # available, this method will simply return the input string. return x + # TODO: Counting failures and having a max_failure count. When that is exceeded (also just at the end), # we print the number of failures, the first N examples that failed, and the first N exceptions raised. @@ -54,7 +54,9 @@ class EvaluationResult(Prediction): - results: a list of (example, prediction, score) tuples for each example in devset """ - def __init__(self, score: float, results: list[tuple["dspy.Example", "dspy.Example", Any]]): + def __init__( + self, score: float, results: list[tuple["dspy.Example", "dspy.Example", Any]] + ): super().__init__(score=score, results=results) def __repr__(self): @@ -69,19 +71,19 @@ class Evaluate: """ def __init__( - self, - *, - devset: list["dspy.Example"], - metric: Callable | None = None, - num_threads: int | None = None, - display_progress: bool = False, - display_table: bool | int = False, - max_errors: int | None = None, - provide_traceback: bool | None = None, - failure_score: float = 0.0, - save_as_csv=None, - save_as_json=None, - **kwargs, + self, + *, + devset: list["dspy.Example"], + metric: Callable | None = None, + num_threads: int | None = None, + display_progress: bool = False, + display_table: bool | int = False, + max_errors: int | None = None, + provide_traceback: bool | None = None, + failure_score: float = 0.0, + save_as_csv=None, + save_as_json=None, + **kwargs, ): """ Args: @@ -109,21 +111,21 @@ def __init__( if "return_outputs" in kwargs: raise ValueError( - "`return_outputs` is no longer supported. 
Results are always returned inside the `results` field of the `EvaluationResult` object.") + "`return_outputs` is no longer supported. Results are always returned inside the `results` field of the `EvaluationResult` object." + ) @with_callbacks def __call__( - self, - program: "dspy.Module", - metric: Callable | None = None, - devset: list["dspy.Example"] | None = None, - num_threads: int | None = None, - display_progress: bool | None = None, - display_table: bool | int | None = None, - callback_metadata: dict[str, Any] | None = None, - save_as_csv: Optional[str] = None, - save_as_json: Optional[str] = None, - + self, + program: "dspy.Module", + metric: Callable | None = None, + devset: list["dspy.Example"] | None = None, + num_threads: int | None = None, + display_progress: bool | None = None, + display_table: bool | int | None = None, + callback_metadata: dict[str, Any] | None = None, + save_as_csv: str | None = None, + save_as_json: str | None = None, ) -> EvaluationResult: """ Args: @@ -148,20 +150,30 @@ def __call__( metric = metric if metric is not None else self.metric devset = devset if devset is not None else self.devset num_threads = num_threads if num_threads is not None else self.num_threads - display_progress = display_progress if display_progress is not None else self.display_progress - display_table = display_table if display_table is not None else self.display_table + display_progress = ( + display_progress if display_progress is not None else self.display_progress + ) + display_table = ( + display_table if display_table is not None else self.display_table + ) save_as_csv = save_as_csv if save_as_csv is not None else self.save_as_csv save_as_json = save_as_json if save_as_json is not None else self.save_as_json if callback_metadata: - logger.debug(f"Evaluate is called with callback metadata: {callback_metadata}") + logger.debug( + f"Evaluate is called with callback metadata: {callback_metadata}" + ) tqdm.tqdm._instances.clear() executor = ParallelExecutor( num_threads=num_threads, disable_progress_bar=not display_progress, - max_errors=(self.max_errors if self.max_errors is not None else dspy.settings.max_errors), + max_errors=( + self.max_errors + if self.max_errors is not None + else dspy.settings.max_errors + ), provide_traceback=self.provide_traceback, compare_results=True, ) @@ -174,28 +186,46 @@ def process_item(example): results = executor.execute(process_item, devset) assert len(devset) == len(results) - results = [((dspy.Prediction(), self.failure_score) if r is None else r) for r in results] - results = [(example, prediction, score) for example, (prediction, score) in zip(devset, results, strict=False)] + results = [ + ((dspy.Prediction(), self.failure_score) if r is None else r) + for r in results + ] + results = [ + (example, prediction, score) + for example, (prediction, score) in zip(devset, results, strict=False) + ] ncorrect, ntotal = sum(score for *_, score in results), len(devset) - logger.info(f"Average Metric: {ncorrect} / {ntotal} ({round(100 * ncorrect / ntotal, 1)}%)") + logger.info( + f"Average Metric: {ncorrect} / {ntotal} ({round(100 * ncorrect / ntotal, 1)}%)" + ) if display_table: if importlib.util.find_spec("pandas") is not None: # Rename the 'correct' column to the name of the metric object - metric_name = metric.__name__ if isinstance(metric, types.FunctionType) else metric.__class__.__name__ + metric_name = ( + metric.__name__ + if isinstance(metric, types.FunctionType) + else metric.__class__.__name__ + ) # Construct a pandas DataFrame from 
the results result_df = self._construct_result_table(results, metric_name) self._display_result_table(result_df, display_table, metric_name) else: - logger.warning("Skipping table display since `pandas` is not installed.") + logger.warning( + "Skipping table display since `pandas` is not installed." + ) if save_as_csv: - metric_name = metric.__name__ if isinstance(metric, types.FunctionType) else metric.__class__.__name__ + metric_name = ( + metric.__name__ + if isinstance(metric, types.FunctionType) + else metric.__class__.__name__ + ) data = self._prepare_results_output(results, metric_name) - with open(save_as_csv, 'w', newline='') as csvfile: + with open(save_as_csv, "w", newline="") as csvfile: fieldnames = data[0].keys() writer = csv.DictWriter(csvfile, fieldnames=fieldnames) @@ -203,9 +233,16 @@ def process_item(example): for row in data: writer.writerow(row) if save_as_json: - metric_name = metric.__name__ if isinstance(metric, types.FunctionType) else metric.__class__.__name__ + metric_name = ( + metric.__name__ + if isinstance(metric, types.FunctionType) + else metric.__class__.__name__ + ) data = self._prepare_results_output(results, metric_name) - with open(save_as_json, 'w', ) as f: + with open( + save_as_json, + "w", + ) as f: json.dump(data, f) return EvaluationResult( @@ -214,7 +251,9 @@ def process_item(example): ) @staticmethod - def _prepare_results_output(results: list[tuple[dspy.Example, dspy.Example, Any]], metric_name: str): + def _prepare_results_output( + results: list[tuple["dspy.Example", "dspy.Example", Any]], metric_name: str + ): return [ ( merge_dicts(example, prediction) | {metric_name: score} @@ -225,7 +264,9 @@ def _prepare_results_output(results: list[tuple[dspy.Example, dspy.Example, Any] ] def _construct_result_table( - self, results: list[tuple["dspy.Example", "dspy.Example", Any]], metric_name: str + self, + results: list[tuple["dspy.Example", "dspy.Example", Any]], + metric_name: str, ) -> "pd.DataFrame": """ Construct a pandas DataFrame from the specified result list. @@ -244,11 +285,17 @@ def _construct_result_table( # Truncate every cell in the DataFrame (DataFrame.applymap was renamed to DataFrame.map in Pandas 2.1.0) result_df = pd.DataFrame(data) - result_df = result_df.map(truncate_cell) if hasattr(result_df, "map") else result_df.applymap(truncate_cell) + result_df = ( + result_df.map(truncate_cell) + if hasattr(result_df, "map") + else result_df.applymap(truncate_cell) + ) return result_df.rename(columns={"correct": metric_name}) - def _display_result_table(self, result_df: "pd.DataFrame", display_table: bool | int, metric_name: str): + def _display_result_table( + self, result_df: "pd.DataFrame", display_table: bool | int, metric_name: str + ): """ Display the specified result DataFrame in a table format. @@ -323,7 +370,9 @@ def stylize_metric_name(df: "pd.DataFrame", metric_name: str) -> "pd.DataFrame": :param metric_name: The name of the metric for which to stylize DataFrame cell contents. 
""" df[metric_name] = df[metric_name].apply( - lambda x: f"✔️ [{x:.3f}]" if x and isinstance(x, float) else f"✔️ [{x}]" if x else "" + lambda x: ( + f"✔️ [{x:.3f}]" if x and isinstance(x, float) else f"✔️ [{x}]" if x else "" + ) ) return df @@ -341,12 +390,14 @@ def display_dataframe(df: "pd.DataFrame"): else: # Pretty print the DataFrame to the console with pd.option_context( - "display.max_rows", None, "display.max_columns", None + "display.max_rows", None, "display.max_columns", None ): # more options can be specified also print(df) -def configure_dataframe_for_ipython_notebook_display(df: "pd.DataFrame") -> "pd.DataFrame": +def configure_dataframe_for_ipython_notebook_display( + df: "pd.DataFrame", +) -> "pd.DataFrame": """Set various pandas display options for DataFrame in an IPython notebook environment.""" import pandas as pd @@ -368,6 +419,7 @@ def is_in_ipython_notebook_environment(): except ImportError: return False + # FIXME: TODO: The merge_dicts stuff above is way too quick and dirty. # TODO: the display_table can't handle False but can handle 0! # Not sure how it works with True exactly, probably fails too. From 0ed15aeac3392b07e85aea05a4238450b0d2b38a Mon Sep 17 00:00:00 2001 From: Crawford Collins Date: Wed, 10 Sep 2025 17:20:33 -0500 Subject: [PATCH 3/3] more whitespace fixes --- .../self-contained-example/README.md | 44 +++ .../self-contained-example/vocab_agent.py | 300 ++++++++++++++++++ .../self-contained-example/vocab_examples.csv | 155 +++++++++ 3 files changed, 499 insertions(+) create mode 100644 docs/docs/tutorials/self-contained-example/README.md create mode 100644 docs/docs/tutorials/self-contained-example/vocab_agent.py create mode 100644 docs/docs/tutorials/self-contained-example/vocab_examples.csv diff --git a/docs/docs/tutorials/self-contained-example/README.md b/docs/docs/tutorials/self-contained-example/README.md new file mode 100644 index 0000000000..59d9121767 --- /dev/null +++ b/docs/docs/tutorials/self-contained-example/README.md @@ -0,0 +1,44 @@ +# DSPy Simple Optimizer Tutorial + +This tutorial demonstrates DSPy optimization techniques using a French-English vocabulary translation task. The script +includes 155 translation examples and supports three different optimizers: BootstrapFewShot, COPRO, and MIPROv2. + +## Requirements + +## Setup + +The script is configured to use LM Studio with a local model. Ensure LM Studio is running on `http://localhost:1234` +with a loaded model. If you want to use your a different model, change it in the dspy configuration. + +```bash +pip install dspy +``` + +This script uses the unsloth version of Google Gemma 1 (gemma-3-1b-it-GGUF/gemma-3-1b-it-Q8_0.gguf). Which you will +likely need to download with the lm studio tool. + +## Usage + +### Basic Evaluation + +```bash +python vocab_agent.py +``` + +### Optimization + +```bash +# BootstrapFewShot (recommended) +python vocab_agent.py --optimize bootstrap + +# COPRO +python vocab_agent.py --optimize copro + +# MIPROv2 +python vocab_agent.py --optimize miprov2 +``` + +## Dataset + +The `vocab_examples.csv` contains 155 French-English translation pairs with correct/incorrect labels for training and +evaluation. 
\ No newline at end of file
diff --git a/docs/docs/tutorials/self-contained-example/vocab_agent.py b/docs/docs/tutorials/self-contained-example/vocab_agent.py
new file mode 100644
index 0000000000..a313f63e07
--- /dev/null
+++ b/docs/docs/tutorials/self-contained-example/vocab_agent.py
@@ -0,0 +1,300 @@
+import csv
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional
+
+import dspy
+from dspy import BootstrapFewShot, COPRO, Evaluate, MIPROv2
+
+
+# Data models for structured output
+@dataclass
+class VocabTranslationResult:
+    """Result of vocabulary translation evaluation."""
+
+    is_correct: int
+    correct_translation: str
+    explanation: str
+    target_word: str
+
+    def to_dict(self) -> Dict[str, Any]:
+        return {
+            "is_correct": self.is_correct,
+            "correct_translation": self.correct_translation,
+            "explanation": self.explanation,
+            "target_word": self.target_word,
+        }
+
+
+# Enhanced signature with more detailed instructions
+class DetailedVocabularyTranslation(dspy.Signature):
+    """
+    Evaluate vocabulary translation as an expert French tutor.
+    Consider common synonyms and variations.
+    Don't mark wrong if the user didn't provide every possible translation.
+    If the word seems non-standard or rare, mark as correct.
+    """
+
+    source_word: str = dspy.InputField(desc="Word to translate from source language")
+    target_word: str = dspy.InputField(desc="User's translation attempt")
+    source_lang: str = dspy.InputField(desc="Source language")
+    target_lang: str = dspy.InputField(desc="Target language")
+
+    is_correct: int = dspy.OutputField(desc="1 for correct, 0 for incorrect")
+    correct_translation: str = dspy.OutputField(desc="Best translation of the source word")
+    explanation: str = dspy.OutputField(desc="Why the translation is correct/incorrect")
+
+
+class VocabularyTranslationAgent(dspy.Module):
+    """DSPy module for evaluating vocabulary translations with structured output."""
+
+    def __init__(self):
+        super().__init__()
+
+        # Wrap the signature in a Predict module so calling self.evaluate runs the LM
+        self.evaluate = dspy.Predict(DetailedVocabularyTranslation)
+
+    def forward(self, word: str, translation: str, from_french_to_english: bool = True) -> VocabTranslationResult:
+        """
+        Evaluate if a vocabulary translation is correct.
+ + Args: + word: The word to translate + translation: The user's translation + from_french_to_english: Direction of translation + + Returns: + VocabTranslationResult with evaluation details + """ + # Determine source and target languages + if from_french_to_english: + source_lang = "French" + target_lang = "English" + source_word = word + target_word = translation + else: + source_lang = "English" + target_lang = "French" + source_word = translation + target_word = word + + # Get structured response directly + result = self.evaluate( + source_word=source_word, target_word=target_word, source_lang=source_lang, target_lang=target_lang + ) + + return VocabTranslationResult( + is_correct=result.is_correct, + correct_translation=result.correct_translation, + explanation=result.explanation, + target_word=target_word, + ) + + +class EnhancedVocabularyAgent(dspy.Module): + """Enhanced vocabulary translation agent with chain of thought reasoning.""" + + def __init__(self, model: Optional[str] = None): + super().__init__() + + # Use ChainOfThought for better reasoning + self.evaluate = dspy.ChainOfThought(DetailedVocabularyTranslation) + + if model: + self.lm = dspy.LM(model=model) + dspy.configure(lm=self.lm) + + def forward(self, word: str, translation: str, from_french_to_english: bool = True) -> VocabTranslationResult: + """Evaluate with enhanced reasoning.""" + # Prepare languages and words + if from_french_to_english: + source_lang, target_lang = "French", "English" + source_word, target_word = word, translation + else: + source_lang, target_lang = "English", "French" + source_word, target_word = translation, word + + # Evaluate with reasoning + result = self.evaluate( + source_word=source_word, target_word=target_word, source_lang=source_lang, target_lang=target_lang + ) + + return VocabTranslationResult( + is_correct=result.is_correct, + correct_translation=result.correct_translation, + explanation=result.explanation, + target_word=target_word, + ) + + +def load_examples_from_csv(csv_path: str = "vocab_examples.csv") -> List[dspy.Example]: + """Load vocabulary examples from CSV file.""" + examples = [] + + with open(csv_path, 'r', encoding='utf-8') as file: + reader = csv.DictReader(file) + for row in reader: + example = dspy.Example( + source_word=row['source_word'], + target_word=row['target_word'], + source_lang=row['source_lang'], + target_lang=row['target_lang'], + expected=int(row['expected']) + ).with_inputs("source_word", "target_word", "source_lang", "target_lang") + examples.append(example) + + return examples + + +def score_translation(example, prediction, trace=None, pred_name=None, pred_trace=None) -> float: + """Score a translation result against expected outcome.""" + # Handle different calling signatures from GEPA vs regular evaluation + if hasattr(prediction, 'is_correct'): + return example["expected"] == prediction.is_correct + else: + # For GEPA, prediction might be the raw output + return 0.0 # Default fallback + + +def split_dataset(examples: List[dspy.Example], train_ratio: float = 0.8) -> tuple[ + List[dspy.Example], List[dspy.Example]]: + """Split dataset into training and validation sets.""" + import random + random.seed(42) # For reproducible splits + + shuffled = examples.copy() + random.shuffle(shuffled) + + split_idx = int(len(shuffled) * train_ratio) + train_set = shuffled[:split_idx] + val_set = shuffled[split_idx:] + + return train_set, val_set + + +def run_optimization(examples: List[dspy.Example], optimizer_type: str = "bootstrap") -> dspy.Module: + """Run 
DSPy optimization with different optimizers."""
+
+    # Split dataset
+    train_set, val_set = split_dataset(examples)
+    print(f"Training set: {len(train_set)} examples")
+    print(f"Validation set: {len(val_set)} examples")
+
+    # Create student program (the one to optimize)
+    student = dspy.ChainOfThought(DetailedVocabularyTranslation)
+
+    # Setup optimizer based on type
+    if optimizer_type == "bootstrap":
+        print("Using BootstrapFewShot optimizer...")
+        optimizer = BootstrapFewShot(
+            metric=score_translation,
+            max_bootstrapped_demos=8,  # Number of examples to bootstrap
+            max_labeled_demos=4,  # Max labeled demonstrations
+        )
+    elif optimizer_type == "copro":
+        print("Using COPRO optimizer...")
+        optimizer = COPRO(
+            metric=score_translation,
+            breadth=10,
+            depth=3,
+            init_temperature=1.0
+        )
+    elif optimizer_type == "miprov2":
+        print("Using MIPROv2 optimizer...")
+        optimizer = MIPROv2(
+            metric=score_translation,
+            auto="light"  # Use light auto-configuration
+        )
+    else:
+        raise ValueError(f"Unknown optimizer type: {optimizer_type}")
+
+    print(f"Starting {optimizer_type} optimization...")
+
+    # Run optimization
+    if optimizer_type == "bootstrap":
+        optimized_program = optimizer.compile(
+            student=student,
+            trainset=train_set  # BootstrapFewShot doesn't use valset
+        )
+    elif optimizer_type == "copro":
+        optimized_program = optimizer.compile(
+            student=student,
+            trainset=train_set,  # COPRO doesn't use valset
+            eval_kwargs={}  # Required parameter for COPRO
+        )
+    else:
+        optimized_program = optimizer.compile(
+            student=student,
+            trainset=train_set,
+            valset=val_set  # MIPROv2 evaluates candidates on the validation set
+        )
+
+    print(f"{optimizer_type} optimization completed!")
+
+    # Evaluate on full validation set
+    evaluator = Evaluate(devset=val_set, num_threads=1, display_progress=True)
+
+    print("\nEvaluating original program:")
+    original_result = evaluator(student, metric=score_translation)
+    original_score = original_result if isinstance(original_result, (int, float)) else original_result.score
+
+    print("\nEvaluating optimized program:")
+    optimized_result = evaluator(optimized_program, metric=score_translation)
+    optimized_score = optimized_result if isinstance(optimized_result, (int, float)) else optimized_result.score
+
+    print(f"\nOptimization Results ({optimizer_type}):")
+    # Evaluate returns scores as 0-100 percentages, so format them directly
+    print(f"Original accuracy: {original_score:.1f}%")
+    print(f"Optimized accuracy: {optimized_score:.1f}%")
+    print(f"Improvement: {optimized_score - original_score:.1f} percentage points")
+
+    return optimized_program
+
+
+# Example usage and testing
+if __name__ == "__main__":
+    # Setup Gemma model through LM Studio using standard DSPy LM
+    import litellm
+
+    litellm.set_verbose = False
+
+    lm = dspy.LM(
+        model="openai/gemma-3-1b-it",
+        api_base="http://localhost:1234/v1",
+        api_key="dummy",
+        temperature=1.0,
+        top_p=0.95,
+        min_p=0.0,
+        frequency_penalty=1,
+        # max_tokens=512
+    )
+    dspy.configure(lm=lm)
+    # Load examples from CSV
+    examples = load_examples_from_csv()
+    print(f"Loaded {len(examples)} examples from CSV")
+
+    # Choose mode: basic evaluation or optimizer-driven compilation
+    import sys
+
+    if len(sys.argv) > 1 and sys.argv[1].startswith("--optimize"):
+        # Parse optimizer type
+        optimizer_type = "bootstrap"  # default
+        if len(sys.argv) > 2:
+            optimizer_type = sys.argv[2]
+        elif "=" in sys.argv[1]:
+            optimizer_type = sys.argv[1].split("=")[1]
+
+        # Run optimization
+        optimized_program = run_optimization(examples, optimizer_type)
+
+        # Save optimized program (optional)
+        # optimized_program.save(f'optimized_vocab_agent_{optimizer_type}.json')
+
+    else:
+        # Basic evaluation
+        agent = dspy.Predict(DetailedVocabularyTranslation)
+        evaluator = Evaluate(devset=examples, num_threads=1, display_progress=True, display_table=True)
+        score = evaluator(agent, metric=score_translation).score
+        # score is already a 0-100 percentage, so format it directly
+        print(f"\nBaseline accuracy: {score:.1f}%")
+        print("\nOptimization options:")
+        print("  python vocab_agent.py --optimize bootstrap  # BootstrapFewShot (recommended)")
+        print("  python vocab_agent.py --optimize copro      # COPRO")
+        print("  python vocab_agent.py --optimize miprov2    # MIPROv2")
diff --git a/docs/docs/tutorials/self-contained-example/vocab_examples.csv b/docs/docs/tutorials/self-contained-example/vocab_examples.csv
new file mode 100644
index 0000000000..b5f87a42c3
--- /dev/null
+++ b/docs/docs/tutorials/self-contained-example/vocab_examples.csv
@@ -0,0 +1,155 @@
+source_word,target_word,source_lang,target_lang,expected
+chat,cat,French,English,1
+chien,cat,French,English,0
+book,livre,French,English,1
+manger,to sleep,French,English,0
+regarder,to watch,French,English,1
+apporterai,I have brought.,French,English,0
+cerceaux,skip,French,English,0
+déchirasses,slink,French,English,0
+chien,dog,French,English,1
+livre,book,French,English,1
+eau,water,French,English,1
+rouge,red,French,English,1
+manger,to eat,French,English,1
+boire,to drink,French,English,1
+dormir,to sleep,French,English,1
+courir,to run,French,English,1
+maison,house,French,English,1
+voiture,car,French,English,1
+école,school,French,English,1
+ami,friend,French,English,1
+temps,time,French,English,1
+jour,day,French,English,1
+nuit,night,French,English,1
+soleil,sun,French,English,1
+lune,moon,French,English,1
+chat,dog,French,English,0
+livre,water,French,English,0
+rouge,green,French,English,0
+manger,to run,French,English,0
+boire,to sleep,French,English,0
+dormir,to eat,French,English,0
+maison,car,French,English,0
+voiture,house,French,English,0
+école,friend,French,English,0
+temps,day,French,English,0
+manière,way,French,English,1
+ces,these,French,English,1
+pour,for,French,English,1
+voyait,saw,French,English,1
+comment,how,French,English,1
+dessus,above,French,English,1
+où,where,French,English,1
+église,church,French,English,1
+louis,louis,French,English,1
+fond,bottom,French,English,1
+soldats,soldiers,French,English,1
+an,year,French,English,1
+grand,big,French,English,1
+une,a,French,English,1
+reprit,resumed,French,English,1
+toute,all,French,English,1
+années,years,French,English,1
+elle-même,herself,French,English,1
+tard,late,French,English,1
+ils,they,French,English,1
+es,are,French,English,1
+trouvait,found,French,English,1
+seul,alone,French,English,1
+vint,came,French,English,1
+mais,but,French,English,1
+donne,gives,French,English,1
+jour,day,French,English,1
+mes,my,French,English,1
+toutes,all,French,English,1
+vérité,truth,French,English,1
+furent,were,French,English,1
+plus,more,French,English,1
+auteur,author,French,English,1
+front,forehead,French,English,1
+œil,eye,French,English,1
+français,french,French,English,1
+de,of,French,English,1
+regard,look,French,English,1
+ainsi,thus,French,English,1
+rire,laugh,French,English,1
+note,note,French,English,1
+sorte,sort,French,English,1
+première,first,French,English,1
+triste,sad,French,English,1
+autre,other,French,English,1
+chef,chief,French,English,1
+mis,put,French,English,1
+quant,as for,French,English,1
+vient,comes,French,English,1
+rendre,render,French,English,1
+manière,house,French,English,0
+ces,those,French,English,0
+pour,against,French,English,0 +voyait,heard,French,English,0 +comment,why,French,English,0 +dessus,below,French,English,0 +où,when,French,English,0 +église,hospital,French,English,0 +louis,pierre,French,English,0 +fond,top,French,English,0 +soldats,civilians,French,English,0 +an,month,French,English,0 +grand,small,French,English,0 +une,the,French,English,0 +reprit,started,French,English,0 +toute,some,French,English,0 +années,days,French,English,0 +elle-même,himself,French,English,0 +tard,early,French,English,0 +ils,we,French,English,0 +es,is,French,English,0 +trouvait,lost,French,English,0 +seul,together,French,English,0 +vint,went,French,English,0 +mais,and,French,English,0 +donne,takes,French,English,0 +mes,your,French,English,0 +toutes,some,French,English,0 +vérité,lie,French,English,0 +furent,are,French,English,0 +plus,less,French,English,0 +auteur,reader,French,English,0 +front,back,French,English,0 +œil,ear,French,English,0 +français,english,French,English,0 +de,to,French,English,0 +regard,sound,French,English,0 +ainsi,never,French,English,0 +rire,cry,French,English,0 +note,letter,French,English,0 +sorte,type,French,English,0 +première,last,French,English,0 +triste,happy,French,English,0 +autre,same,French,English,0 +chef,worker,French,English,0 +mis,took,French,English,0 +quant,because,French,English,0 +vient,goes,French,English,0 +rendre,take,French,English,0 +longtemps,long time,French,English,1 +longue,long,French,English,1 +aller,to go,French,English,1 +tour,turn,French,English,1 +tes,your,French,English,1 +main,hand,French,English,1 +femmes,women,French,English,1 +si,if,French,English,1 +faisait,was doing,French,English,1 +oncle,uncle,French,English,1 +longtemps,short time,French,English,0 +longue,short,French,English,0 +aller,to stay,French,English,0 +tour,square,French,English,0 +tes,my,French,English,0 +main,foot,French,English,0 +femmes,men,French,English,0 +si,when,French,English,0 +faisait,was sleeping,French,English,0 +oncle,aunt,French,English,0 \ No newline at end of file
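
For reference, the records written by `save_as_csv` / `save_as_json` come from `_prepare_results_output` (patch 1): each `(example, prediction, score)` triple is flattened into one dict merging the example's fields with the prediction's output fields, plus the score stored under the metric's name. A minimal sketch of reading back the hypothetical `vocab_results.csv` from the README example above:

```python
import csv

# Each row merges the example fields (source_word, target_word, source_lang,
# target_lang, expected) with the prediction's outputs (is_correct,
# correct_translation, explanation) plus a "score_translation" score column.
with open("vocab_results.csv", newline="", encoding="utf-8") as f:
    rows = list(csv.DictReader(f))

print(sorted(rows[0].keys()))
print(f"{len(rows)} evaluated examples")
```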