19 changes: 19 additions & 0 deletions examples/k_module_problem/README.md
@@ -166,6 +166,25 @@ This establishes the "no learning" baseline. Any method that beats this is demon

**Key insight**: While OpenEvolve takes more iterations on average (52.3 vs 13), it has a **100% success rate** compared to iterative refinement's 33%. The evolutionary approach's population diversity ensures it eventually escapes local optima that trap single-trajectory methods.

### Rich Feedback Mode: Proving Attribution Matters

To verify that feedback attribution is the key factor, we added a `RICH_FEEDBACK=1` mode that tells the agent exactly which modules are correct/incorrect:

```bash
RICH_FEEDBACK=1 python run_iterative_trials.py --trials 3 --iterations 100
```

| Method | Success Rate | Avg Iterations |
|--------|-------------|----------------|
| **Iterative (unattributed feedback)** | 33% | 13 (when found) |
| **Iterative (rich feedback)** | **100%** | **3** |

With rich feedback, iterative refinement achieves a **100% success rate in only 3 iterations on average**, dramatically faster than OpenEvolve's 52 iterations. This demonstrates that:

1. **Feedback attribution is the key factor**, not the optimization method
2. When feedback is attributable, iterative refinement is highly effective (see the sketch below)
3. Evolution is necessary when feedback is NOT attributable (you can't tell which component is wrong)
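
For intuition, here is a minimal sketch of why attribution collapses the search. The module names and option table below are hypothetical (the real ones live in `evaluator.py`'s `VALID_OPTIONS`): with per-module feedback, the agent only resamples the modules flagged incorrect, so the search becomes roughly linear in the number of modules instead of combinatorial.

```python
import random

# Hypothetical option table, mirroring the shape of VALID_OPTIONS in evaluator.py.
OPTIONS = {
    "tokenizer": ["word", "bpe", "char"],
    "embedder": ["tfidf", "word2vec", "bert"],
}

def repair(config: dict, incorrect: list) -> dict:
    """Resample only the modules flagged incorrect; keep correct ones fixed."""
    fixed = dict(config)
    for module in incorrect:
        # Never retry the option we already know is wrong.
        choices = [o for o in OPTIONS[module] if o != config[module]]
        fixed[module] = random.choice(choices)
    return fixed

# e.g. repair({"tokenizer": "word", "embedder": "tfidf"}, ["tokenizer"])
# touches only "tokenizer", so each module converges independently.
```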

## Why This Matters

This example illustrates when you should prefer evolutionary approaches:
32 changes: 30 additions & 2 deletions examples/k_module_problem/evaluator.py
@@ -9,13 +9,21 @@
This creates a challenging landscape for iterative refinement but
allows evolutionary crossover to combine good "building blocks"
from different individuals.

Set RICH_FEEDBACK=1 to enable rich feedback mode, which tells you
exactly which modules are correct/incorrect. This demonstrates that
iterative refinement works well when feedback is attributable.
"""

import os
import sys
import time
import traceback
import importlib.util

# Rich feedback mode - when enabled, reveals which modules are correct
RICH_FEEDBACK = os.environ.get("RICH_FEEDBACK", "0") == "1"

# The correct solution (hidden from the optimizer)
# This represents the "optimal" pipeline configuration discovered through
# extensive testing/domain expertise
@@ -141,14 +149,34 @@ def score_config(config: dict) -> tuple:

def build_artifacts(config: dict, correct_count: int, module_results: dict, eval_time: float) -> dict:
"""
Build artifacts that provide useful feedback without revealing
exactly which modules are correct.
Build artifacts that provide useful feedback.

In normal mode: Only reveals how many modules are correct, not which ones.
In rich feedback mode (RICH_FEEDBACK=1): Reveals exactly which modules are correct/incorrect.
"""
artifacts = {}

# Configuration summary
artifacts["configuration"] = str(config)

# Rich feedback mode - reveals which modules are correct/incorrect
if RICH_FEEDBACK:
correct_modules = [m for m, is_correct in module_results.items() if is_correct]
incorrect_modules = [m for m, is_correct in module_results.items() if not is_correct]

artifacts["module_feedback"] = {
"correct": correct_modules,
"incorrect": incorrect_modules,
}

if incorrect_modules:
hints = []
for module in incorrect_modules:
hints.append(f"'{module}' is WRONG - try a different option from {VALID_OPTIONS[module]}")
artifacts["actionable_hints"] = hints
else:
artifacts["actionable_hints"] = ["All modules are correct!"]

# Score feedback - tells you how many are correct, but not which ones
if correct_count == NUM_MODULES:
artifacts["status"] = "PERFECT! All modules correctly configured!"
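
For illustration, a minimal usage sketch of the rich-feedback path above. The module names and values here are hypothetical (substitute the actual keys of `VALID_OPTIONS`), and `RICH_FEEDBACK` must be set before `evaluator` is imported, since the flag is read at import time:

```python
import os

os.environ["RICH_FEEDBACK"] = "1"  # must be set before the import below

from evaluator import build_artifacts  # assumes examples/k_module_problem is on sys.path

# Hypothetical config/results; real module names come from VALID_OPTIONS.
artifacts = build_artifacts(
    config={"tokenizer": "bpe", "embedder": "tfidf"},
    correct_count=1,
    module_results={"tokenizer": True, "embedder": False},
    eval_time=0.01,
)
# artifacts["module_feedback"]  -> {"correct": ["tokenizer"], "incorrect": ["embedder"]}
# artifacts["actionable_hints"] -> ["'embedder' is WRONG - try a different option from ..."]
```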
27 changes: 26 additions & 1 deletion examples/k_module_problem/iterative_agent.py
@@ -64,6 +64,26 @@ def write_program(program_path: str, code: str) -> None:
        f.write(code)


def format_rich_feedback(artifacts: dict) -> str:
    """Format rich feedback if available (RICH_FEEDBACK=1)."""
    if "module_feedback" not in artifacts:
        return ""

    feedback = artifacts["module_feedback"]
    hints = artifacts.get("actionable_hints", [])

    result = "\n## DETAILED MODULE FEEDBACK (Rich Feedback Mode)\n"
    result += f"- CORRECT modules: {feedback.get('correct', [])}\n"
    result += f"- INCORRECT modules: {feedback.get('incorrect', [])}\n"

    if hints:
        result += "\n### Actionable Hints:\n"
        for hint in hints:
            result += f"- {hint}\n"

    return result
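# Illustrative input/output (hypothetical values): given artifacts like
#   {"module_feedback": {"correct": ["m1"], "incorrect": ["m2"]},
#    "actionable_hints": ["'m2' is WRONG - try a different option from [...]"]}
# format_rich_feedback returns a markdown section that lists the correct and
# incorrect modules, followed by the actionable hints.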


def create_improvement_prompt(
    current_code: str,
    metrics: dict,
@@ -108,6 +128,7 @@ def create_improvement_prompt(
- Score: {metrics.get('combined_score', 0):.2%}
- Status: {artifacts.get('status', 'N/A')}
- Suggestion: {artifacts.get('suggestion', 'N/A')}
{format_rich_feedback(artifacts)}
{history_str}

## Your Task
@@ -205,7 +226,11 @@ def run_iterative_refinement(

        # Evaluate current program
        eval_result = evaluate(str(current_program_path))
        # Handle both flat (success) and nested (error) return formats
        if "metrics" in eval_result:
            metrics = eval_result["metrics"]
        else:
            metrics = {k: v for k, v in eval_result.items() if k != "artifacts"}
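        # Illustrative result shapes (assumed values, following the comment above):
        #   flat / success: {"combined_score": 0.6, "artifacts": {...}}
        #   nested / error: {"metrics": {"combined_score": 0.0}, "artifacts": {...}}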
        artifacts = eval_result.get("artifacts", {})

        score = metrics.get("combined_score", 0)