diff --git a/cascadeflow/agent.py b/cascadeflow/agent.py index eaac58f0..f202f196 100644 --- a/cascadeflow/agent.py +++ b/cascadeflow/agent.py @@ -96,6 +96,7 @@ # Phase 2B + v2.5: Telemetry module imports (with CostCalculator) from .telemetry import CallbackManager, CostCalculator, MetricsCollector +from .integrations.litellm import LITELLM_AVAILABLE, LiteLLMCostProvider logger = logging.getLogger(__name__) @@ -1650,6 +1651,9 @@ async def _execute_cascade_with_timing( timing["quality_verification"] = result.metadata.get("quality_check_ms", 0) timing["verifier_generation"] = result.metadata.get("verifier_latency_ms", 0) timing["cascade_overhead"] = result.metadata.get("cascade_overhead_ms", 0) + timing["tool_complexity_analysis_ms"] = result.metadata.get( + "tool_complexity_analysis_ms", 0 + ) else: timing["cascade_total"] = cascade_total @@ -1714,8 +1718,33 @@ async def _execute_direct_with_timing( ) direct_latency = (time.time() - direct_start) * 1000 - tokens_used = response.tokens_used if hasattr(response, "tokens_used") else max_tokens - cost = best_model.cost * (tokens_used / 1000) + prompt_tokens = None + completion_tokens = None + total_tokens = None + if hasattr(response, "metadata") and response.metadata: + prompt_tokens = response.metadata.get("prompt_tokens") + completion_tokens = response.metadata.get("completion_tokens") + total_tokens = response.metadata.get("total_tokens") + if total_tokens is None and prompt_tokens is not None and completion_tokens is not None: + total_tokens = prompt_tokens + completion_tokens + + cost = None + if LITELLM_AVAILABLE and prompt_tokens is not None and completion_tokens is not None: + try: + provider = LiteLLMCostProvider() + cost = provider.calculate_cost( + model=best_model.name, + input_tokens=prompt_tokens, + output_tokens=completion_tokens, + ) + except Exception as exc: + logger.warning(f"LiteLLM direct cost failed for {best_model.name}: {exc}") + + if cost is None: + tokens_used = response.tokens_used if hasattr(response, "tokens_used") else None + if not tokens_used: + tokens_used = total_tokens if total_tokens is not None else max_tokens + cost = best_model.cost * (tokens_used / 1000) result = self._create_direct_result( response.content, @@ -1724,6 +1753,9 @@ async def _execute_direct_with_timing( direct_latency, reason, tool_calls=getattr(response, "tool_calls", None), + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + total_tokens=total_tokens, ) timing = { @@ -1732,6 +1764,7 @@ async def _execute_direct_with_timing( "quality_verification": 0, "verifier_generation": 0, "cascade_overhead": 0, + "tool_complexity_analysis_ms": 0.0, } return result, timing @@ -2132,7 +2165,18 @@ def _build_cascade_result( # HELPER METHODS - WITH TOOL SUPPORT # ======================================================================== - def _create_direct_result(self, content, model, cost, latency, reason, tool_calls=None): + def _create_direct_result( + self, + content, + model, + cost, + latency, + reason, + tool_calls=None, + prompt_tokens: Optional[int] = None, + completion_tokens: Optional[int] = None, + total_tokens: Optional[int] = None, + ): """ Create result object for direct routing with tool support. 
@@ -2142,7 +2186,18 @@ def _create_direct_result(self, content, model, cost, latency, reason, tool_call class DirectResult: """Mimics cascade results for telemetry compatibility.""" - def __init__(self, content, model, cost, latency, reason, tool_calls=None): + def __init__( + self, + content, + model, + cost, + latency, + reason, + tool_calls=None, + prompt_tokens: Optional[int] = None, + completion_tokens: Optional[int] = None, + total_tokens: Optional[int] = None, + ): # Core attributes self.content = content self.model_used = model @@ -2158,6 +2213,16 @@ def __init__(self, content, model, cost, latency, reason, tool_calls=None): self.verifier_confidence = 0.95 self.speedup = 1.0 + token_total = total_tokens + if ( + token_total is None + and prompt_tokens is not None + and completion_tokens is not None + ): + token_total = prompt_tokens + completion_tokens + if token_total is None: + token_total = int(len(content.split()) * 1.3) + # Complete metadata self.metadata = { "reason": reason, @@ -2184,10 +2249,12 @@ def __init__(self, content, model, cost, latency, reason, tool_calls=None): "quality_threshold": None, "quality_check_passed": None, "rejection_reason": None, + "prompt_tokens": prompt_tokens or 0, + "completion_tokens": completion_tokens or 0, + "total_tokens": token_total, "tokens_generated": int(len(content.split()) * 1.3), - "total_tokens": int(len(content.split()) * 1.3), "draft_tokens": 0, - "verifier_tokens": int(len(content.split()) * 1.3), + "verifier_tokens": token_total, "draft_model": None, "verifier_model": model, "tool_calls": tool_calls, @@ -2216,7 +2283,17 @@ def to_dict(self): "metadata": self.metadata, } - return DirectResult(content, model, cost, latency, reason, tool_calls) + return DirectResult( + content, + model, + cost, + latency, + reason, + tool_calls, + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + total_tokens=total_tokens, + ) def _dict_to_result(self, data): """Convert dict to result object.""" diff --git a/tests/benchmarks/README.md b/tests/benchmarks/README.md index f38c9116..b3f9abd5 100644 --- a/tests/benchmarks/README.md +++ b/tests/benchmarks/README.md @@ -8,6 +8,7 @@ Professional benchmarks to validate CascadeFlow performance across real-world us 2. **Bitext Customer Support** - Customer service Q&A (27,000+ examples) 3. **Banking77** - Banking intent classification (13,000+ examples) 4. **GSM8K** - Grade school math reasoning (8,500+ problems) +5. 
**ToolCalls Real-World** - Tool routing with multi-turn context #### Metrics @@ -16,6 +17,7 @@ Each benchmark measures: - **Quality maintenance** (accuracy/pass rate) - **Latency** improvements - **Escalation rates** (drafter acceptance %) +- **Direct routing** counts and **cascade overhead** latency #### Running Benchmarks diff --git a/tests/benchmarks/banking77_benchmark.py b/tests/benchmarks/banking77_benchmark.py index c94483d4..6261c5f6 100644 --- a/tests/benchmarks/banking77_benchmark.py +++ b/tests/benchmarks/banking77_benchmark.py @@ -388,12 +388,19 @@ async def run_cascade(self, query: str) -> dict[str, Any]: "model_used": result.model_used, "accepted": result.draft_accepted, "quality_score": result.quality_score, + "routing_strategy": result.routing_strategy, "drafter_cost": result.draft_cost, "verifier_cost": result.verifier_cost, "total_cost": result.total_cost, "latency_ms": latency_ms, - "tokens_input": 0, - "tokens_output": 0, + "cascadeflow_latency_ms": ( + (result.complexity_detection_ms or 0) + + (result.metadata.get("domain_detection_ms", 0) if result.metadata else 0) + + (result.metadata.get("tool_complexity_analysis_ms", 0) if result.metadata else 0) + + (result.quality_verification_ms or 0) + ), + "tokens_input": result.metadata.get("prompt_tokens", 0), + "tokens_output": result.metadata.get("completion_tokens", 0), } diff --git a/tests/benchmarks/base.py b/tests/benchmarks/base.py index bc7641b8..a94d5a3e 100644 --- a/tests/benchmarks/base.py +++ b/tests/benchmarks/base.py @@ -22,6 +22,7 @@ class BenchmarkResult: model_used: str # "drafter" or "verifier" accepted: bool # True if drafter accepted quality_score: float # 0-1 quality score from verifier + routing_strategy: str # "cascade" or "direct" # Cost metrics (in USD) drafter_cost: float @@ -31,6 +32,7 @@ class BenchmarkResult: # Performance metrics latency_ms: float + cascadeflow_latency_ms: float tokens_input: int tokens_output: int @@ -46,7 +48,17 @@ class BenchmarkResult: @property def escalated(self) -> bool: """True if query was escalated to verifier.""" - return not self.accepted + return self.verifier_rejected + + @property + def direct_routed(self) -> bool: + """True if query was routed directly (no cascade).""" + return self.routing_strategy == "direct" + + @property + def verifier_rejected(self) -> bool: + """True if draft was rejected and verifier was used.""" + return self.routing_strategy == "cascade" and not self.accepted @property def cost_savings(self) -> float: @@ -76,6 +88,8 @@ class BenchmarkSummary: escalated_to_verifier: int acceptance_rate_pct: float escalation_rate_pct: float + direct_routed: int + direct_routing_pct: float # Cost metrics total_cost: float @@ -88,11 +102,13 @@ class BenchmarkSummary: avg_latency_ms: float median_latency_ms: float p95_latency_ms: float + avg_cascadeflow_latency_ms: float # Quality metrics accuracy: float # Percentage of correct predictions drafter_accuracy: float # Accuracy when drafter was used verifier_accuracy: float # Accuracy when verifier was used + direct_accuracy: float # Accuracy when routed directly # Token usage total_input_tokens: int @@ -266,6 +282,12 @@ async def run(self) -> BenchmarkSummary: cascade_result["prediction"], ground_truth ) + routing_strategy = cascade_result.get("routing_strategy") + if not routing_strategy: + routing_strategy = ( + "direct" if cascade_result.get("direct_routed") else "cascade" + ) + tokens_input = cascade_result["tokens_input"] tokens_output = cascade_result["tokens_output"] if tokens_input == 0: @@ -293,11 
+315,13 @@ async def run(self) -> BenchmarkSummary: model_used=cascade_result["model_used"], accepted=cascade_result["accepted"], quality_score=cascade_result["quality_score"], + routing_strategy=routing_strategy, drafter_cost=cascade_result["drafter_cost"], verifier_cost=cascade_result["verifier_cost"], total_cost=cascade_result["total_cost"], baseline_cost=baseline_cost, latency_ms=cascade_result["latency_ms"], + cascadeflow_latency_ms=cascade_result.get("cascadeflow_latency_ms", 0.0), tokens_input=tokens_input, tokens_output=tokens_output, ground_truth=ground_truth, @@ -322,11 +346,13 @@ async def run(self) -> BenchmarkSummary: model_used="error", accepted=False, quality_score=0.0, + routing_strategy="cascade", drafter_cost=0.0, verifier_cost=0.0, total_cost=0.0, baseline_cost=0.0, latency_ms=0.0, + cascadeflow_latency_ms=0.0, tokens_input=0, tokens_output=0, ground_truth=ground_truth, @@ -362,6 +388,8 @@ def _generate_summary(self) -> BenchmarkSummary: escalated_to_verifier=0, acceptance_rate_pct=0.0, escalation_rate_pct=0.0, + direct_routed=0, + direct_routing_pct=0.0, total_cost=0.0, total_baseline_cost=0.0, total_savings=0.0, @@ -370,9 +398,11 @@ def _generate_summary(self) -> BenchmarkSummary: avg_latency_ms=0.0, median_latency_ms=0.0, p95_latency_ms=0.0, + avg_cascadeflow_latency_ms=0.0, accuracy=0.0, drafter_accuracy=0.0, verifier_accuracy=0.0, + direct_accuracy=0.0, total_input_tokens=0, total_output_tokens=0, avg_input_tokens=0.0, @@ -398,8 +428,12 @@ def _generate_summary(self) -> BenchmarkSummary: result.baseline_cost = max(result.baseline_cost, baseline_cost) # Cascade metrics - drafter_accepted = sum(1 for r in valid_results if r.accepted) - escalated = sum(1 for r in valid_results if r.escalated) + direct_routed = sum(1 for r in valid_results if r.direct_routed) + drafter_accepted = sum( + 1 for r in valid_results if r.routing_strategy == "cascade" and r.accepted + ) + escalated = sum(1 for r in valid_results if r.verifier_rejected) + cascade_total = drafter_accepted + escalated # Cost metrics total_cost = sum(r.total_cost for r in valid_results) @@ -413,23 +447,35 @@ def _generate_summary(self) -> BenchmarkSummary: median_latency = latencies[len(latencies) // 2] p95_idx = int(len(latencies) * 0.95) p95_latency = latencies[p95_idx] + cascadeflow_latencies = [r.cascadeflow_latency_ms for r in valid_results] + avg_cascadeflow_latency = ( + sum(cascadeflow_latencies) / len(cascadeflow_latencies) + if cascadeflow_latencies + else 0.0 + ) # Quality metrics correct = sum(1 for r in valid_results if r.is_correct) accuracy = (correct / len(valid_results) * 100) if valid_results else 0.0 - drafter_results = [r for r in valid_results if r.accepted] + drafter_results = [ + r for r in valid_results if r.routing_strategy == "cascade" and r.accepted + ] drafter_correct = sum(1 for r in drafter_results if r.is_correct) drafter_accuracy = ( (drafter_correct / len(drafter_results) * 100) if drafter_results else 0.0 ) - verifier_results = [r for r in valid_results if r.escalated] + verifier_results = [r for r in valid_results if r.verifier_rejected] verifier_correct = sum(1 for r in verifier_results if r.is_correct) verifier_accuracy = ( (verifier_correct / len(verifier_results) * 100) if verifier_results else 0.0 ) + direct_results = [r for r in valid_results if r.direct_routed] + direct_correct = sum(1 for r in direct_results if r.is_correct) + direct_accuracy = (direct_correct / len(direct_results) * 100) if direct_results else 0.0 + # Token usage total_input = sum(r.tokens_input for r 
in valid_results) total_output = sum(r.tokens_output for r in valid_results) @@ -441,8 +487,12 @@ def _generate_summary(self) -> BenchmarkSummary: failed_tests=failed, drafter_accepted=drafter_accepted, escalated_to_verifier=escalated, - acceptance_rate_pct=(drafter_accepted / successful * 100) if successful > 0 else 0.0, - escalation_rate_pct=(escalated / successful * 100) if successful > 0 else 0.0, + acceptance_rate_pct=( + (drafter_accepted / cascade_total * 100) if cascade_total > 0 else 0.0 + ), + escalation_rate_pct=(escalated / cascade_total * 100) if cascade_total > 0 else 0.0, + direct_routed=direct_routed, + direct_routing_pct=(direct_routed / successful * 100) if successful > 0 else 0.0, total_cost=total_cost, total_baseline_cost=total_baseline, total_savings=total_savings, @@ -451,9 +501,11 @@ def _generate_summary(self) -> BenchmarkSummary: avg_latency_ms=avg_latency, median_latency_ms=median_latency, p95_latency_ms=p95_latency, + avg_cascadeflow_latency_ms=avg_cascadeflow_latency, accuracy=accuracy, drafter_accuracy=drafter_accuracy, verifier_accuracy=verifier_accuracy, + direct_accuracy=direct_accuracy, total_input_tokens=total_input, total_output_tokens=total_output, avg_input_tokens=total_input / successful if successful > 0 else 0.0, @@ -478,6 +530,7 @@ def _print_summary(self, summary: BenchmarkSummary) -> None: print( f" Escalated: {summary.escalated_to_verifier} ({summary.escalation_rate_pct:.1f}%)" ) + print(f" Direct Routed: {summary.direct_routed} ({summary.direct_routing_pct:.1f}%)") print("\nCOST ANALYSIS:") print(f" Total Cost: ${summary.total_cost:.6f}") @@ -490,11 +543,13 @@ def _print_summary(self, summary: BenchmarkSummary) -> None: print(f" Avg Latency: {summary.avg_latency_ms:.0f}ms") print(f" Median Latency: {summary.median_latency_ms:.0f}ms") print(f" P95 Latency: {summary.p95_latency_ms:.0f}ms") + print(f" Avg Cascade Overhead:{summary.avg_cascadeflow_latency_ms:.0f}ms") print("\nQUALITY:") print(f" Overall Accuracy: {summary.accuracy:.1f}%") print(f" Drafter Accuracy: {summary.drafter_accuracy:.1f}%") print(f" Verifier Accuracy: {summary.verifier_accuracy:.1f}%") + print(f" Direct Accuracy: {summary.direct_accuracy:.1f}%") print("\nTOKEN USAGE:") print(f" Total Input: {summary.total_input_tokens:,}") diff --git a/tests/benchmarks/bfcl/bfcl_full_benchmark.py b/tests/benchmarks/bfcl/bfcl_full_benchmark.py index 30fb302a..04bb74f9 100644 --- a/tests/benchmarks/bfcl/bfcl_full_benchmark.py +++ b/tests/benchmarks/bfcl/bfcl_full_benchmark.py @@ -29,6 +29,7 @@ sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent)) from cascadeflow import CascadeAgent, DomainConfig, ModelConfig +from tests.benchmarks.utils import resolve_model_cost, resolve_model_pair, resolve_model_provider @dataclass @@ -381,16 +382,31 @@ def _check_function_correct( response: str, expected_func: Optional[str], expected_params: Optional[dict] = None, + expected_funcs: Optional[list[str]] = None, ) -> tuple[bool, bool]: """Check if function call is correct.""" + response_lower = response.lower() + + if expected_funcs: + counts = {} + for func in expected_funcs: + func_key = func.lower() + counts[func_key] = counts.get(func_key, 0) + 1 + + for func, expected_count in counts.items(): + func_mentions = response_lower.count(func) + response_lower.count( + func.replace("_", " ") + ) + if func_mentions < expected_count: + return False, True + return True, True + found_func, found_params = self._extract_function_call(response) if expected_func is None: # No tool should be used 
func_correct = ( - found_func is None - or "don't need" in response.lower() - or "no tool" in response.lower() + found_func is None or "don't need" in response_lower or "no tool" in response_lower ) return func_correct, True @@ -414,6 +430,7 @@ async def run_single(self, task: dict) -> BFCLResult: tools = task["tools"] prompt = task["prompt"] expected_func = task.get("expected_function") + expected_funcs = task.get("expected_functions") expected_params = task.get("expected_params") # Format tools for prompt @@ -436,10 +453,23 @@ async def run_single(self, task: dict) -> BFCLResult: User request: {prompt}""" # Create agent + drafter_provider = resolve_model_provider(self.drafter_model) + verifier_provider = resolve_model_provider(self.verifier_model) + drafter_cost = resolve_model_cost(self.drafter_model, 0.00015) + verifier_cost = resolve_model_cost(self.verifier_model, 0.0025) + agent = CascadeAgent( models=[ - ModelConfig(name=self.drafter_model, provider="openai", cost=0.00015), - ModelConfig(name=self.verifier_model, provider="openai", cost=0.0025), + ModelConfig( + name=self.drafter_model, + provider=drafter_provider, + cost=drafter_cost, + ), + ModelConfig( + name=self.verifier_model, + provider=verifier_provider, + cost=verifier_cost, + ), ], enable_domain_detection=True, use_semantic_domains=True, @@ -452,7 +482,10 @@ async def run_single(self, task: dict) -> BFCLResult: latency_ms = (time.time() - start_time) * 1000 func_correct, params_correct = self._check_function_correct( - result.content, expected_func, expected_params + result.content, + expected_func, + expected_params, + expected_funcs, ) found_func, _ = self._extract_function_call(result.content) @@ -572,8 +605,9 @@ async def main(): parser = argparse.ArgumentParser(description="BFCL-style Function Calling Benchmark") parser.add_argument("--sample", type=int, help="Run N tasks") parser.add_argument("--full", action="store_true", help="Run all tasks") - parser.add_argument("--drafter", default="gpt-4o-mini") - parser.add_argument("--verifier", default="gpt-4o") + default_drafter, default_verifier = resolve_model_pair("gpt-4o-mini", "gpt-4o") + parser.add_argument("--drafter", default=default_drafter) + parser.add_argument("--verifier", default=default_verifier) args = parser.parse_args() diff --git a/tests/benchmarks/customer_support.py b/tests/benchmarks/customer_support.py index 7f0f48ac..53a8e0fd 100644 --- a/tests/benchmarks/customer_support.py +++ b/tests/benchmarks/customer_support.py @@ -387,10 +387,17 @@ async def run_cascade(self, query: str) -> dict[str, Any]: "model_used": result.model_used, "accepted": result.draft_accepted, "quality_score": result.quality_score or 0.0, + "routing_strategy": result.routing_strategy, "drafter_cost": result.draft_cost or 0.0, "verifier_cost": result.verifier_cost or 0.0, "total_cost": result.total_cost, "latency_ms": result.latency_ms, + "cascadeflow_latency_ms": ( + (result.complexity_detection_ms or 0) + + (result.metadata.get("domain_detection_ms", 0) if result.metadata else 0) + + (result.metadata.get("tool_complexity_analysis_ms", 0) if result.metadata else 0) + + (result.quality_verification_ms or 0) + ), "tokens_input": result.metadata.get("prompt_tokens", 0), "tokens_output": result.metadata.get("completion_tokens", 0), } diff --git a/tests/benchmarks/gsm8k/gsm8k.py b/tests/benchmarks/gsm8k/gsm8k.py index 786f65a4..c96118b0 100644 --- a/tests/benchmarks/gsm8k/gsm8k.py +++ b/tests/benchmarks/gsm8k/gsm8k.py @@ -390,6 +390,12 @@ async def run_cascade(self, query: str) -> 
dict[str, Any]: "total_cost": total_cost, "baseline_cost": baseline_cost, # For accurate savings calculation "latency_ms": latency_ms, + "cascadeflow_latency_ms": ( + (result.complexity_detection_ms or 0) + + (result.metadata.get("domain_detection_ms", 0) if result.metadata else 0) + + (result.metadata.get("tool_complexity_analysis_ms", 0) if result.metadata else 0) + + (result.quality_verification_ms or 0) + ), "tokens_input": prompt_tokens, "tokens_output": completion_tokens, "routing_strategy": routing_strategy, diff --git a/tests/benchmarks/gsm8k/gsm8k_full_benchmark.py b/tests/benchmarks/gsm8k/gsm8k_full_benchmark.py index 05259111..2207241e 100644 --- a/tests/benchmarks/gsm8k/gsm8k_full_benchmark.py +++ b/tests/benchmarks/gsm8k/gsm8k_full_benchmark.py @@ -28,6 +28,7 @@ sys.path.insert(0, str(Path(__file__).parent.parent.parent)) from cascadeflow import CascadeAgent, DomainConfig, ModelConfig +from tests.benchmarks.utils import resolve_model_cost, resolve_model_pair, resolve_model_provider from cascadeflow.routing.domain import Domain, DomainDetector, SemanticDomainDetector # GSM8K test set URL (from OpenAI's grade-school-math repo) @@ -354,18 +355,18 @@ async def run_cascade_benchmark( ) -> list[BenchmarkResult]: """Run cascade benchmark with given parameters.""" - # Determine provider based on model name - verifier_provider = "anthropic" if "claude" in verifier.lower() else "openai" + default_drafter, default_verifier = resolve_model_pair(drafter, verifier) + drafter = default_drafter + verifier = default_verifier - # Model costs per 1K tokens (input/output average) - # GPT-4o-mini: $0.15/$0.60 per 1M = $0.000375/1K avg - # Claude Opus 4.5: $5/$25 per 1M = $0.015/1K avg - drafter_cost = 0.000375 if "gpt-4o-mini" in drafter else 0.002 - verifier_cost = 0.015 if "opus" in verifier.lower() else 0.0025 + drafter_provider = resolve_model_provider(drafter) + verifier_provider = resolve_model_provider(verifier) + drafter_cost = resolve_model_cost(drafter, 0.000375) + verifier_cost = resolve_model_cost(verifier, 0.0025) agent = CascadeAgent( models=[ - ModelConfig(name=drafter, provider="openai", cost=drafter_cost), + ModelConfig(name=drafter, provider=drafter_provider, cost=drafter_cost), ModelConfig(name=verifier, provider=verifier_provider, cost=verifier_cost), ], enable_domain_detection=True, @@ -461,15 +462,13 @@ def analyze_results(results: list[BenchmarkResult], config: ParameterConfig) -> total_cost = sum(r.cost for r in results) avg_latency = sum(r.latency_ms for r in results) / total if total > 0 else 0 - # Calculate baseline cost (all verifier) - # Cost ratio: Claude Opus 4.5 ($15/1K avg) vs GPT-4o-mini ($0.375/1K avg) = 40x - # For OpenAI-only: GPT-4o ($10/1K avg) vs GPT-4o-mini ($0.375/1K avg) = 26.7x - verifier_drafter_ratio = 40.0 # Opus 4.5 vs GPT-4o-mini - baseline_cost = ( - total_cost - * (1.0 / (draft_accepted / total + (1 - draft_accepted / total) * verifier_drafter_ratio)) - if draft_accepted > 0 - else total_cost + # Estimate baseline cost if all queries ran on the verifier. + drafter, verifier = resolve_model_pair("gpt-4o-mini", "claude-opus-4-5-20251101") + drafter_cost = resolve_model_cost(drafter, 0.000375) + verifier_cost = resolve_model_cost(verifier, 0.0025) + verifier_drafter_ratio = verifier_cost / drafter_cost if drafter_cost > 0 else 1.0 + baseline_cost = sum( + r.cost * verifier_drafter_ratio if r.draft_accepted else r.cost for r in results ) return { @@ -607,9 +606,13 @@ async def main(): sample_problems = problems[:sample_size] # 3. 
Run benchmark or sweep + drafter, verifier = resolve_model_pair("gpt-4o-mini", "claude-opus-4-5-20251101") + drafter_cost = resolve_model_cost(drafter, 0.000375) + verifier_cost = resolve_model_cost(verifier, 0.0025) + print("\nšŸ“Š Using models:") - print(" Drafter: gpt-4o-mini ($0.15/$0.60 per 1M)") - print(" Verifier: claude-opus-4-5-20251101 ($5/$25 per 1M)") + print(f" Drafter: {drafter} (~${drafter_cost:.4f}/1k)") + print(f" Verifier: {verifier} (~${verifier_cost:.4f}/1k)") if args.sweep: sweep_results = await parameter_sweep( diff --git a/tests/benchmarks/humaneval/humaneval.py b/tests/benchmarks/humaneval/humaneval.py index 76afa406..1220dd5f 100644 --- a/tests/benchmarks/humaneval/humaneval.py +++ b/tests/benchmarks/humaneval/humaneval.py @@ -124,12 +124,31 @@ def load_dataset(self) -> list[tuple[str, Any]]: if self.max_samples is None: try: + import gzip import json import urllib.request - url = "https://raw.githubusercontent.com/openai/human-eval/master/data/HumanEval.jsonl" - with urllib.request.urlopen(url, timeout=30) as response: - payload = response.read().decode("utf-8") + urls = [ + "https://raw.githubusercontent.com/openai/human-eval/master/data/HumanEval.jsonl", + "https://raw.githubusercontent.com/openai/human-eval/master/data/HumanEval.jsonl.gz", + ] + payload = None + last_error = None + for url in urls: + try: + with urllib.request.urlopen(url, timeout=30) as response: + raw = response.read() + if url.endswith(".gz"): + payload = gzip.decompress(raw).decode("utf-8") + else: + payload = raw.decode("utf-8") + break + except Exception as exc: + last_error = exc + + if payload is None: + raise RuntimeError(last_error or "Failed to download HumanEval dataset") + full_problems = [json.loads(line) for line in payload.splitlines() if line.strip()] return [(p["prompt"], p) for p in full_problems] except Exception as exc: @@ -207,10 +226,17 @@ async def run_cascade(self, query: str) -> dict[str, Any]: "model_used": result.model_used, "accepted": result.draft_accepted, "quality_score": result.quality_score or 0.0, + "routing_strategy": result.routing_strategy, "drafter_cost": result.draft_cost or 0.0, "verifier_cost": result.verifier_cost or 0.0, "total_cost": result.total_cost, "latency_ms": result.latency_ms, + "cascadeflow_latency_ms": ( + (result.complexity_detection_ms or 0) + + (result.metadata.get("domain_detection_ms", 0) if result.metadata else 0) + + (result.metadata.get("tool_complexity_analysis_ms", 0) if result.metadata else 0) + + (result.quality_verification_ms or 0) + ), "tokens_input": result.metadata.get("prompt_tokens", 0), "tokens_output": result.metadata.get("completion_tokens", 0), } diff --git a/tests/benchmarks/humaneval/humaneval_full_benchmark.py b/tests/benchmarks/humaneval/humaneval_full_benchmark.py index 95074847..cffa48d6 100644 --- a/tests/benchmarks/humaneval/humaneval_full_benchmark.py +++ b/tests/benchmarks/humaneval/humaneval_full_benchmark.py @@ -30,6 +30,7 @@ sys.path.insert(0, str(Path(__file__).parent.parent.parent)) from cascadeflow import CascadeAgent, DomainConfig, ModelConfig +from tests.benchmarks.utils import resolve_model_cost, resolve_model_pair, resolve_model_provider # HumanEval dataset URL HUMANEVAL_URL = "https://raw.githubusercontent.com/openai/human-eval/master/data/HumanEval.jsonl.gz" @@ -263,10 +264,23 @@ async def run_single(self, problem: dict) -> BenchmarkResult: original_prompt = problem["prompt"] # Create CascadeFlow agent with code domain config + drafter_provider = resolve_model_provider(self.drafter_model) + 
verifier_provider = resolve_model_provider(self.verifier_model) + drafter_cost = resolve_model_cost(self.drafter_model, 0.00015) + verifier_cost = resolve_model_cost(self.verifier_model, 0.015) + agent = CascadeAgent( models=[ - ModelConfig(name=self.drafter_model, provider="openai", cost=0.00015), - ModelConfig(name=self.verifier_model, provider="anthropic", cost=0.015), + ModelConfig( + name=self.drafter_model, + provider=drafter_provider, + cost=drafter_cost, + ), + ModelConfig( + name=self.verifier_model, + provider=verifier_provider, + cost=verifier_cost, + ), ], enable_domain_detection=True, use_semantic_domains=True, @@ -431,10 +445,11 @@ async def main(): parser = argparse.ArgumentParser(description="HumanEval Benchmark for CascadeFlow") parser.add_argument("--sample", type=int, help="Run on N problems (default: all)") parser.add_argument("--full", action="store_true", help="Run all 164 problems") - parser.add_argument("--drafter", type=str, default="gpt-4o-mini", help="Drafter model") - parser.add_argument( - "--verifier", type=str, default="claude-opus-4-5-20251101", help="Verifier model" + default_drafter, default_verifier = resolve_model_pair( + "gpt-4o-mini", "claude-opus-4-5-20251101" ) + parser.add_argument("--drafter", type=str, default=default_drafter, help="Drafter model") + parser.add_argument("--verifier", type=str, default=default_verifier, help="Verifier model") parser.add_argument("--threshold", type=float, default=0.50, help="Quality threshold") args = parser.parse_args() diff --git a/tests/benchmarks/longbench/longbench_full_benchmark.py b/tests/benchmarks/longbench/longbench_full_benchmark.py index 5d5093c3..af14ef7f 100644 --- a/tests/benchmarks/longbench/longbench_full_benchmark.py +++ b/tests/benchmarks/longbench/longbench_full_benchmark.py @@ -33,6 +33,7 @@ sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent)) from cascadeflow import CascadeAgent, DomainConfig, ModelConfig +from tests.benchmarks.utils import resolve_model_cost, resolve_model_pair, resolve_model_provider # HuggingFace datasets - optional import try: @@ -528,10 +529,23 @@ async def run_single(self, task: dict) -> LongBenchResult: Provide a clear, direct answer based only on the information in the document.""" # Create agent + drafter_provider = resolve_model_provider(self.drafter_model) + verifier_provider = resolve_model_provider(self.verifier_model) + drafter_cost = resolve_model_cost(self.drafter_model, 0.00015) + verifier_cost = resolve_model_cost(self.verifier_model, 0.0025) + agent = CascadeAgent( models=[ - ModelConfig(name=self.drafter_model, provider="openai", cost=0.00015), - ModelConfig(name=self.verifier_model, provider="openai", cost=0.0025), + ModelConfig( + name=self.drafter_model, + provider=drafter_provider, + cost=drafter_cost, + ), + ModelConfig( + name=self.verifier_model, + provider=verifier_provider, + cost=verifier_cost, + ), ], enable_domain_detection=True, use_semantic_domains=True, @@ -702,8 +716,9 @@ async def main(): parser.add_argument("--hf", action="store_true", help="Use HuggingFace LongBench dataset") parser.add_argument("--hf-subsets", type=str, help="Comma-separated HF subsets (default: all)") parser.add_argument("--hf-per-subset", type=int, default=20, help="Max tasks per HF subset") - parser.add_argument("--drafter", default="gpt-4o-mini") - parser.add_argument("--verifier", default="gpt-4o") + default_drafter, default_verifier = resolve_model_pair("gpt-4o-mini", "gpt-4o") + parser.add_argument("--drafter", default=default_drafter) + 
parser.add_argument("--verifier", default=default_verifier) args = parser.parse_args() diff --git a/tests/benchmarks/metrics.py b/tests/benchmarks/metrics.py index c7e83dca..449d1e51 100644 --- a/tests/benchmarks/metrics.py +++ b/tests/benchmarks/metrics.py @@ -91,12 +91,15 @@ class QualityMetrics: overall_accuracy: float drafter_accuracy: float verifier_accuracy: float + direct_accuracy: float total_correct: int total_incorrect: int drafter_correct: int drafter_incorrect: int verifier_correct: int verifier_incorrect: int + direct_correct: int + direct_incorrect: int @staticmethod def from_results(results: list[BenchmarkResult]) -> "QualityMetrics": @@ -106,12 +109,15 @@ def from_results(results: list[BenchmarkResult]) -> "QualityMetrics": overall_accuracy=0.0, drafter_accuracy=0.0, verifier_accuracy=0.0, + direct_accuracy=0.0, total_correct=0, total_incorrect=0, drafter_correct=0, drafter_incorrect=0, verifier_correct=0, verifier_incorrect=0, + direct_correct=0, + direct_incorrect=0, ) # Overall @@ -120,7 +126,7 @@ def from_results(results: list[BenchmarkResult]) -> "QualityMetrics": overall_accuracy = (total_correct / len(results) * 100) if results else 0.0 # Drafter - drafter_results = [r for r in results if r.accepted] + drafter_results = [r for r in results if r.routing_strategy == "cascade" and r.accepted] drafter_correct = sum(1 for r in drafter_results if r.is_correct) drafter_incorrect = len(drafter_results) - drafter_correct drafter_accuracy = ( @@ -128,21 +134,30 @@ def from_results(results: list[BenchmarkResult]) -> "QualityMetrics": ) # Verifier - verifier_results = [r for r in results if r.escalated] + verifier_results = [r for r in results if r.verifier_rejected] verifier_correct = sum(1 for r in verifier_results if r.is_correct) verifier_incorrect = len(verifier_results) - verifier_correct verifier_accuracy = ( (verifier_correct / len(verifier_results) * 100) if verifier_results else 0.0 ) + # Direct + direct_results = [r for r in results if r.direct_routed] + direct_correct = sum(1 for r in direct_results if r.is_correct) + direct_incorrect = len(direct_results) - direct_correct + direct_accuracy = (direct_correct / len(direct_results) * 100) if direct_results else 0.0 + return QualityMetrics( overall_accuracy=overall_accuracy, drafter_accuracy=drafter_accuracy, verifier_accuracy=verifier_accuracy, + direct_accuracy=direct_accuracy, total_correct=total_correct, total_incorrect=total_incorrect, drafter_correct=drafter_correct, drafter_incorrect=drafter_incorrect, verifier_correct=verifier_correct, verifier_incorrect=verifier_incorrect, + direct_correct=direct_correct, + direct_incorrect=direct_incorrect, ) diff --git a/tests/benchmarks/mmlu/mmlu.py b/tests/benchmarks/mmlu/mmlu.py index 09d79a3b..64c41328 100644 --- a/tests/benchmarks/mmlu/mmlu.py +++ b/tests/benchmarks/mmlu/mmlu.py @@ -17,8 +17,10 @@ import time from typing import Any, Optional +from cascadeflow import CascadeAgent, ModelConfig from .base import Benchmark, BenchmarkResult, BenchmarkSummary from .benchmark_config import BenchmarkConfig, BenchmarkMode +from tests.benchmarks.utils import resolve_model_cost, resolve_model_pair, resolve_model_provider class MMLUCategory: @@ -601,9 +603,6 @@ async def run_cascade(self, query: str) -> dict[str, Any]: Returns: Cascade result dict """ - # Import here to avoid circular imports - from cascadeflow import CascadeAgent, ModelConfig - # Get question data from results lookup question_data = None for qid, qdata in self.load_dataset(): @@ -618,10 +617,23 @@ async def 
run_cascade(self, query: str) -> dict[str, Any]: formatted_question = self._format_question(question_data) # Create agent with config + drafter_provider = resolve_model_provider(self.drafter_model) + verifier_provider = resolve_model_provider(self.verifier_model) + drafter_cost = resolve_model_cost(self.drafter_model, 0.00015) + verifier_cost = resolve_model_cost(self.verifier_model, 0.0025) + agent = CascadeAgent( models=[ - ModelConfig(name=self.drafter_model, provider="openai", cost=0.00015), - ModelConfig(name=self.verifier_model, provider="openai", cost=0.0025), + ModelConfig( + name=self.drafter_model, + provider=drafter_provider, + cost=drafter_cost, + ), + ModelConfig( + name=self.verifier_model, + provider=verifier_provider, + cost=verifier_cost, + ), ], quality_threshold=self.quality_threshold, enable_domain_detection=self.config.enable_domain_pipeline, @@ -638,12 +650,19 @@ async def run_cascade(self, query: str) -> dict[str, Any]: "model_used": "drafter" if result.model_used == self.drafter_model else "verifier", "accepted": result.model_used == self.drafter_model, "quality_score": result.metadata.get("quality_score", 0.7), - "drafter_cost": result.total_cost if result.model_used == self.drafter_model else 0.0, - "verifier_cost": result.total_cost if result.model_used != self.drafter_model else 0.0, + "routing_strategy": result.routing_strategy, + "drafter_cost": result.draft_cost or 0.0, + "verifier_cost": result.verifier_cost or 0.0, "total_cost": result.total_cost, "latency_ms": latency_ms, - "tokens_input": result.metadata.get("tokens_input", 0), - "tokens_output": result.metadata.get("tokens_output", 0), + "cascadeflow_latency_ms": ( + (result.complexity_detection_ms or 0) + + (result.metadata.get("domain_detection_ms", 0) if result.metadata else 0) + + (result.metadata.get("tool_complexity_analysis_ms", 0) if result.metadata else 0) + + (result.quality_verification_ms or 0) + ), + "tokens_input": result.metadata.get("prompt_tokens", 0), + "tokens_output": result.metadata.get("completion_tokens", 0), } def _format_question(self, question_data: dict) -> str: diff --git a/tests/benchmarks/mmlu/mmlu_full_benchmark.py b/tests/benchmarks/mmlu/mmlu_full_benchmark.py index f88dc3a7..542c5919 100644 --- a/tests/benchmarks/mmlu/mmlu_full_benchmark.py +++ b/tests/benchmarks/mmlu/mmlu_full_benchmark.py @@ -38,6 +38,7 @@ sys.path.insert(0, str(Path(__file__).parent.parent.parent)) from cascadeflow import CascadeAgent, DomainConfig, ModelConfig +from tests.benchmarks.utils import resolve_model_cost, resolve_model_pair, resolve_model_provider from cascadeflow.routing.domain import DomainDetector, SemanticDomainDetector # Retry configuration @@ -564,6 +565,7 @@ async def run_cascade_benchmark( config: ParameterConfig, drafter: str = "gpt-4o-mini", verifier: str = "claude-sonnet-4-20250514", + concurrency: int = 1, ) -> list[BenchmarkResult]: """Run cascade benchmark with given parameters. @@ -571,11 +573,11 @@ async def run_cascade_benchmark( MMLU covers: math, science, code, medical, financial, legal, general. 
""" - verifier_provider = "anthropic" if "claude" in verifier.lower() else "openai" - drafter_cost = 0.000375 # GPT-4o-mini avg ($0.15/$0.60 per 1M) - verifier_cost = ( - 0.015 if "opus" in verifier.lower() else 0.003 if "sonnet" in verifier.lower() else 0.0025 - ) + drafter, verifier = resolve_model_pair(drafter, verifier) + drafter_provider = resolve_model_provider(drafter) + verifier_provider = resolve_model_provider(verifier) + drafter_cost = resolve_model_cost(drafter, 0.000375) + verifier_cost = resolve_model_cost(verifier, 0.0025) # Configure all domains expected in MMLU # Each domain gets same drafter/verifier but optimized thresholds @@ -636,7 +638,7 @@ async def run_cascade_benchmark( agent = CascadeAgent( models=[ - ModelConfig(name=drafter, provider="openai", cost=drafter_cost), + ModelConfig(name=drafter, provider=drafter_provider, cost=drafter_cost), ModelConfig(name=verifier, provider=verifier_provider, cost=verifier_cost), ], enable_domain_detection=True, @@ -644,9 +646,9 @@ async def run_cascade_benchmark( domain_configs=domain_configs, ) - results = [] + results: list[Optional[BenchmarkResult]] = [None] * len(problems) - for i, problem in enumerate(problems): + async def process_problem(index: int, problem: dict) -> tuple[int, BenchmarkResult, str]: query = format_question(problem) correct_answer = problem["answer"] @@ -658,45 +660,58 @@ async def run_cascade_benchmark( predicted = extract_answer(result.content) is_correct = predicted == correct_answer - results.append( - BenchmarkResult( - query=problem["question"][:50] + "...", - subject=problem["subject"], - category=problem["category"], - answer=correct_answer, - predicted=predicted or "N/A", - correct=is_correct, - draft_accepted=result.metadata.get("draft_accepted", False), - domain_detected=result.metadata.get("detected_domain", "unknown"), - cost=result.total_cost, - latency_ms=latency, - ) + benchmark_result = BenchmarkResult( + query=problem["question"][:50] + "...", + subject=problem["subject"], + category=problem["category"], + answer=correct_answer, + predicted=predicted or "N/A", + correct=is_correct, + draft_accepted=result.metadata.get("draft_accepted", False), + domain_detected=result.metadata.get("detected_domain", "unknown"), + cost=result.total_cost, + latency_ms=latency, ) status = "āœ“" if is_correct else "āœ—" model_used = "[D]" if result.metadata.get("draft_accepted") else "[V]" - print( - f"[{i+1}/{len(problems)}] {problem['subject'][:20]:20s}: {status} {model_used} | Cost: ${result.total_cost:.4f}" + output = ( + f"[{index+1}/{len(problems)}] {problem['subject'][:20]:20s}: " + f"{status} {model_used} | Cost: ${result.total_cost:.4f}" ) - + return index, benchmark_result, output except Exception as e: - print(f"[{i+1}/{len(problems)}] {problem['subject'][:20]:20s}: āš ļø ERROR: {e}") - results.append( - BenchmarkResult( - query=problem["question"][:50] + "...", - subject=problem["subject"], - category=problem["category"], - answer=correct_answer, - predicted="ERROR", - correct=False, - draft_accepted=False, - domain_detected="error", - cost=0.0, - latency_ms=0.0, - ) + output = f"[{index+1}/{len(problems)}] {problem['subject'][:20]:20s}: āš ļø ERROR: {e}" + benchmark_result = BenchmarkResult( + query=problem["question"][:50] + "...", + subject=problem["subject"], + category=problem["category"], + answer=correct_answer, + predicted="ERROR", + correct=False, + draft_accepted=False, + domain_detected="error", + cost=0.0, + latency_ms=0.0, ) + return index, benchmark_result, output - return results 
+ if concurrency <= 1: + for i, problem in enumerate(problems): + index, benchmark_result, output = await process_problem(i, problem) + results[index] = benchmark_result + print(output) + else: + for start in range(0, len(problems), concurrency): + batch = problems[start : start + concurrency] + batch_results = await asyncio.gather( + *(process_problem(start + offset, problem) for offset, problem in enumerate(batch)) + ) + for index, benchmark_result, output in batch_results: + results[index] = benchmark_result + print(output) + + return [result for result in results if result is not None] def analyze_results(results: list[BenchmarkResult], config: ParameterConfig) -> dict: @@ -718,13 +733,13 @@ def analyze_results(results: list[BenchmarkResult], config: ParameterConfig) -> category_stats[r.category]["draft_accepted"] += 1 category_stats[r.category]["cost"] += r.cost - # Baseline cost (all verifier) - verifier_drafter_ratio = 40.0 - baseline_cost = ( - total_cost - * (1.0 / (draft_accepted / total + (1 - draft_accepted / total) * verifier_drafter_ratio)) - if draft_accepted > 0 - else total_cost + # Baseline cost (all verifier) using model cost ratio. + drafter, verifier = resolve_model_pair("gpt-4o-mini", "claude-opus-4-5-20251101") + drafter_cost = resolve_model_cost(drafter, 0.000375) + verifier_cost = resolve_model_cost(verifier, 0.0025) + verifier_drafter_ratio = verifier_cost / drafter_cost if drafter_cost > 0 else 1.0 + baseline_cost = sum( + r.cost * verifier_drafter_ratio if r.draft_accepted else r.cost for r in results ) return { @@ -764,6 +779,12 @@ async def main(): parser.add_argument("--sample", type=int, default=0, help="Number of samples (0=all)") parser.add_argument("--full", action="store_true", help="Run full benchmark") parser.add_argument("--sweep", action="store_true", help="Run parameter sweep") + parser.add_argument( + "--concurrency", + type=int, + default=1, + help="Concurrent requests to speed up the run", + ) args = parser.parse_args() @@ -797,10 +818,13 @@ async def main(): print("\n" + "=" * 70) print("MMLU BENCHMARK") print("=" * 70) + drafter, verifier = resolve_model_pair("gpt-4o-mini", "claude-sonnet-4-20250514") + print("\nConfiguration:") - print(" Drafter: gpt-4o-mini") - print(" Verifier: claude-sonnet-4-20250514") + print(f" Drafter: {drafter}") + print(f" Verifier: {verifier}") print(f" Problems: {sample_size}") + print(f" Concurrency: {args.concurrency}") if args.sweep: thresholds = [0.50, 0.60, 0.70] @@ -809,7 +833,11 @@ async def main(): for threshold in thresholds: config = ParameterConfig(threshold=threshold) print(f"\n--- Threshold: {threshold} ---") - results = await run_cascade_benchmark(sample_problems, config) + results = await run_cascade_benchmark( + sample_problems, + config, + concurrency=args.concurrency, + ) analysis = analyze_results(results, config) all_results.append(analysis) @@ -831,7 +859,11 @@ async def main(): print(f"Cost Savings: {best['cost']['savings_pct']:.1f}%") else: config = ParameterConfig(threshold=0.60) - results = await run_cascade_benchmark(sample_problems, config) + results = await run_cascade_benchmark( + sample_problems, + config, + concurrency=args.concurrency, + ) analysis = analyze_results(results, config) print("\n" + "=" * 70) diff --git a/tests/benchmarks/mtbench/mtbench.py b/tests/benchmarks/mtbench/mtbench.py index 85ad93d7..382abd85 100644 --- a/tests/benchmarks/mtbench/mtbench.py +++ b/tests/benchmarks/mtbench/mtbench.py @@ -305,9 +305,11 @@ async def run_cascade(self, query: str) -> dict[str, Any]: 
total_draft_cost = 0.0 total_verifier_cost = 0.0 total_latency = 0.0 + total_cascadeflow_latency = 0.0 total_prompt_tokens = 0 total_completion_tokens = 0 drafter_accepted_count = 0 + direct_turns = 0 # Execute each turn in sequence for turn_data in turns: @@ -337,12 +339,20 @@ async def run_cascade(self, query: str) -> dict[str, Any]: total_draft_cost += result.draft_cost or 0.0 total_verifier_cost += result.verifier_cost or 0.0 total_latency += result.latency_ms + total_cascadeflow_latency += ( + (result.complexity_detection_ms or 0) + + (result.metadata.get("domain_detection_ms", 0) if result.metadata else 0) + + (result.metadata.get("tool_complexity_analysis_ms", 0) if result.metadata else 0) + + (result.quality_verification_ms or 0) + ) total_prompt_tokens += result.metadata.get("prompt_tokens", 0) total_completion_tokens += result.metadata.get("completion_tokens", 0) turn_used_drafter = result.draft_accepted or result.model_used == self.drafter_model if turn_used_drafter: drafter_accepted_count += 1 + if result.routing_strategy == "direct": + direct_turns += 1 print( f" Turn {turn_num}: Quality={quality:.2f}, " @@ -353,23 +363,27 @@ async def run_cascade(self, query: str) -> dict[str, Any]: num_turns = len(turns) avg_quality = total_quality / num_turns avg_latency = total_latency / num_turns + avg_cascadeflow_latency = total_cascadeflow_latency / num_turns # Baseline cost (all turns use verifier) baseline_cost = self.calculate_baseline_cost(conversation_data) accepted = drafter_accepted_count == num_turns model_used = self.drafter_model if accepted else self.verifier_model + routing_strategy = "direct" if direct_turns == num_turns else "cascade" return { "prediction": responses, "model_used": model_used, "accepted": accepted, "quality_score": avg_quality, + "routing_strategy": routing_strategy, "drafter_cost": total_draft_cost, "verifier_cost": total_verifier_cost, "total_cost": total_cost, "baseline_cost": baseline_cost, "latency_ms": avg_latency, + "cascadeflow_latency_ms": avg_cascadeflow_latency, "tokens_input": total_prompt_tokens, "tokens_output": total_completion_tokens, } diff --git a/tests/benchmarks/mtbench/mtbench_full_benchmark.py b/tests/benchmarks/mtbench/mtbench_full_benchmark.py index 8d84fb0a..9d8b1fdd 100644 --- a/tests/benchmarks/mtbench/mtbench_full_benchmark.py +++ b/tests/benchmarks/mtbench/mtbench_full_benchmark.py @@ -42,6 +42,7 @@ # CascadeFlow imports from cascadeflow import CascadeAgent, DomainConfig, ModelConfig +from tests.benchmarks.utils import resolve_model_cost, resolve_model_pair, resolve_model_provider # ============================================================================= # CONSTANTS @@ -881,8 +882,10 @@ def _create_cascade_agent(self, category: str) -> tuple[CascadeAgent, str]: """ expected_domain = CATEGORY_TO_DOMAIN.get(category, "general") - # Determine verifier provider - verifier_provider = "anthropic" if "claude" in self.verifier_model else "openai" + drafter_provider = resolve_model_provider(self.drafter_model) + verifier_provider = resolve_model_provider(self.verifier_model) + drafter_cost = resolve_model_cost(self.drafter_model, 0.00015) + verifier_cost = resolve_model_cost(self.verifier_model, 0.005) # Domain-specific thresholds (learned from GSM8K benchmark) # Creative domains need lower threshold due to alignment scorer limitations @@ -913,13 +916,13 @@ def _create_cascade_agent(self, category: str) -> tuple[CascadeAgent, str]: models=[ ModelConfig( name=self.drafter_model, - provider="openai", - cost=0.00015, # $0.15 per 1M 
tokens (gpt-4o-mini) + provider=drafter_provider, + cost=drafter_cost, ), ModelConfig( name=self.verifier_model, provider=verifier_provider, - cost=0.005, # $5.00 per 1M tokens (claude-opus-4-5 input) + cost=verifier_cost, ), ], enable_domain_detection=True, @@ -1421,20 +1424,29 @@ async def main(): parser.add_argument("--full", action="store_true", help="Run all 80 questions") parser.add_argument("--category", type=str, help="Run specific category only") parser.add_argument("--max", type=int, help="Maximum questions to run") - parser.add_argument("--drafter", type=str, default="gpt-4o-mini", help="Drafter model") + default_drafter, default_verifier = resolve_model_pair( + "gpt-4o-mini", "claude-opus-4-5-20251101" + ) + parser.add_argument("--drafter", type=str, default=default_drafter, help="Drafter model") parser.add_argument( "--verifier", type=str, - default="claude-opus-4-5-20251101", + default=default_verifier, help="Verifier model (default: Claude Opus 4.5)", ) parser.add_argument("--output", type=str, default="mtbench_results", help="Output directory") args = parser.parse_args() - # Check API key - if not os.getenv("OPENAI_API_KEY"): - print("Error: OPENAI_API_KEY not set") - return + drafter_provider = resolve_model_provider(args.drafter) + verifier_provider = resolve_model_provider(args.verifier) + if drafter_provider == "openai" or verifier_provider == "openai": + if not os.getenv("OPENAI_API_KEY"): + print("Error: OPENAI_API_KEY not set") + return + if drafter_provider == "anthropic" or verifier_provider == "anthropic": + if not os.getenv("ANTHROPIC_API_KEY"): + print("Error: ANTHROPIC_API_KEY not set") + return benchmark = MTBenchFullBenchmark( drafter_model=args.drafter, diff --git a/tests/benchmarks/reporter.py b/tests/benchmarks/reporter.py index ff41843a..ab93ad36 100644 --- a/tests/benchmarks/reporter.py +++ b/tests/benchmarks/reporter.py @@ -52,6 +52,8 @@ def export_json( "escalated_to_verifier": summary.escalated_to_verifier, "acceptance_rate_pct": summary.acceptance_rate_pct, "escalation_rate_pct": summary.escalation_rate_pct, + "direct_routed": summary.direct_routed, + "direct_routing_pct": summary.direct_routing_pct, "total_cost": summary.total_cost, "total_baseline_cost": summary.total_baseline_cost, "total_savings": summary.total_savings, @@ -60,9 +62,11 @@ def export_json( "avg_latency_ms": summary.avg_latency_ms, "median_latency_ms": summary.median_latency_ms, "p95_latency_ms": summary.p95_latency_ms, + "avg_cascadeflow_latency_ms": summary.avg_cascadeflow_latency_ms, "accuracy": summary.accuracy, "drafter_accuracy": summary.drafter_accuracy, "verifier_accuracy": summary.verifier_accuracy, + "direct_accuracy": summary.direct_accuracy, "total_input_tokens": summary.total_input_tokens, "total_output_tokens": summary.total_output_tokens, }, @@ -72,11 +76,13 @@ def export_json( "query": r.query[:100] + "..." 
if len(r.query) > 100 else r.query, "model_used": r.model_used, "accepted": r.accepted, + "routing_strategy": r.routing_strategy, "quality_score": r.quality_score, "total_cost": r.total_cost, "baseline_cost": r.baseline_cost, "cost_savings_pct": r.cost_savings_pct, "latency_ms": r.latency_ms, + "cascadeflow_latency_ms": r.cascadeflow_latency_ms, "is_correct": r.is_correct, "error": r.error, } @@ -123,6 +129,7 @@ def export_csv( "test_id", "model_used", "accepted", + "routing_strategy", "quality_score", "drafter_cost", "verifier_cost", @@ -131,6 +138,7 @@ def export_csv( "cost_savings", "cost_savings_pct", "latency_ms", + "cascadeflow_latency_ms", "tokens_input", "tokens_output", "is_correct", @@ -145,6 +153,7 @@ def export_csv( r.test_id, r.model_used, r.accepted, + r.routing_strategy, f"{r.quality_score:.3f}", f"{r.drafter_cost:.6f}", f"{r.verifier_cost:.6f}", @@ -153,6 +162,7 @@ def export_csv( f"{r.cost_savings:.6f}", f"{r.cost_savings_pct:.1f}", f"{r.latency_ms:.0f}", + f"{r.cascadeflow_latency_ms:.0f}", r.tokens_input, r.tokens_output, r.is_correct, @@ -196,6 +206,7 @@ def generate_markdown_report( | Failed | {summary.failed_tests} | | **Drafter Accepted** | **{summary.drafter_accepted} ({summary.acceptance_rate_pct:.1f}%)** | | **Escalated to Verifier** | **{summary.escalated_to_verifier} ({summary.escalation_rate_pct:.1f}%)** | +| **Direct Routed** | **{summary.direct_routed} ({summary.direct_routing_pct:.1f}%)** | ## Cost Analysis @@ -217,6 +228,7 @@ def generate_markdown_report( | Average Latency | {summary.avg_latency_ms:.0f}ms | | Median Latency | {summary.median_latency_ms:.0f}ms | | P95 Latency | {summary.p95_latency_ms:.0f}ms | +| Average Cascade Overhead | {summary.avg_cascadeflow_latency_ms:.0f}ms | ## Quality @@ -225,6 +237,7 @@ def generate_markdown_report( | Overall Accuracy | {summary.accuracy:.1f}% | | Drafter Accuracy | {summary.drafter_accuracy:.1f}% | | Verifier Accuracy | {summary.verifier_accuracy:.1f}% | +| Direct Accuracy | {summary.direct_accuracy:.1f}% | ## Token Usage diff --git a/tests/benchmarks/ruler/ruler_benchmark.py b/tests/benchmarks/ruler/ruler_benchmark.py index a4062810..d67355e4 100644 --- a/tests/benchmarks/ruler/ruler_benchmark.py +++ b/tests/benchmarks/ruler/ruler_benchmark.py @@ -30,14 +30,16 @@ from cascadeflow import CascadeAgent from cascadeflow.schema.config import ModelConfig +from tests.benchmarks.utils import resolve_model_cost, resolve_model_pair, resolve_model_provider # ============================================================================ # CONFIGURATION # ============================================================================ +default_drafter, default_verifier = resolve_model_pair("gpt-4o-mini", "gpt-4o") DEFAULT_CONFIG = { - "drafter": "gpt-4o-mini", - "verifier": "gpt-4o", + "drafter": default_drafter, + "verifier": default_verifier, "threshold": 0.6, "context_lengths": [1000, 2000, 4000], # Target word counts } @@ -331,10 +333,22 @@ async def run_benchmark( print() # Create agent + drafter_provider = resolve_model_provider(config["drafter"]) + verifier_provider = resolve_model_provider(config["verifier"]) + drafter_cost = resolve_model_cost(config["drafter"], 0.00015) + verifier_cost = resolve_model_cost(config["verifier"], 0.0025) agent = CascadeAgent( models=[ - ModelConfig(name=config["drafter"], provider="openai", cost=0.00015), - ModelConfig(name=config["verifier"], provider="openai", cost=0.0025), + ModelConfig( + name=config["drafter"], + provider=drafter_provider, + cost=drafter_cost, + ), + ModelConfig( + 
name=config["verifier"], + provider=verifier_provider, + cost=verifier_cost, + ), ], enable_domain_detection=True, use_semantic_domains=True, diff --git a/tests/benchmarks/run_all.py b/tests/benchmarks/run_all.py index 8338ac9e..6f96bf15 100644 --- a/tests/benchmarks/run_all.py +++ b/tests/benchmarks/run_all.py @@ -173,6 +173,18 @@ async def run_all_benchmarks( print(f"āŒ Agentic structured tool-calling benchmark failed: {e}\n") results["tool_calls_agentic"] = None + # Run Real-World Tool-Calling Benchmark + try: + print("Running Real-World Tool-Calling Benchmark...") + from .tool_calls_realworld import run_tool_calls_realworld_benchmark + + tool_calls_realworld_summary = await run_tool_calls_realworld_benchmark() + results["tool_calls_realworld"] = tool_calls_realworld_summary + print("āœ… Real-world tool-calling benchmark completed\n") + except Exception as e: + print(f"āŒ Real-world tool-calling benchmark failed: {e}\n") + results["tool_calls_realworld"] = None + # Run Provider Comparison try: print("Running Provider Comparison Benchmark...") diff --git a/tests/benchmarks/tool_calls.py b/tests/benchmarks/tool_calls.py index cc36c1e7..5961579b 100644 --- a/tests/benchmarks/tool_calls.py +++ b/tests/benchmarks/tool_calls.py @@ -214,10 +214,17 @@ async def run_cascade(self, query: Any) -> dict[str, Any]: "model_used": result.model_used, "accepted": result.draft_accepted, "quality_score": result.quality_score or 0.0, + "routing_strategy": result.routing_strategy, "drafter_cost": result.draft_cost or 0.0, "verifier_cost": result.verifier_cost or 0.0, "total_cost": result.total_cost, "latency_ms": result.latency_ms, + "cascadeflow_latency_ms": ( + (result.complexity_detection_ms or 0) + + (result.metadata.get("domain_detection_ms", 0) if result.metadata else 0) + + (result.metadata.get("tool_complexity_analysis_ms", 0) if result.metadata else 0) + + (result.quality_verification_ms or 0) + ), "tokens_input": result.metadata.get("prompt_tokens", 0), "tokens_output": result.metadata.get("completion_tokens", 0), } diff --git a/tests/benchmarks/tool_calls_agentic.py b/tests/benchmarks/tool_calls_agentic.py index 67096790..466aac5c 100644 --- a/tests/benchmarks/tool_calls_agentic.py +++ b/tests/benchmarks/tool_calls_agentic.py @@ -249,10 +249,17 @@ async def run_cascade(self, query: Any) -> dict[str, Any]: "model_used": result.model_used, "accepted": result.draft_accepted, "quality_score": result.quality_score or 0.0, + "routing_strategy": result.routing_strategy, "drafter_cost": result.draft_cost or 0.0, "verifier_cost": result.verifier_cost or 0.0, "total_cost": result.total_cost, "latency_ms": result.latency_ms, + "cascadeflow_latency_ms": ( + (result.complexity_detection_ms or 0) + + (result.metadata.get("domain_detection_ms", 0) if result.metadata else 0) + + (result.metadata.get("tool_complexity_analysis_ms", 0) if result.metadata else 0) + + (result.quality_verification_ms or 0) + ), "tokens_input": result.metadata.get("prompt_tokens", 0), "tokens_output": result.metadata.get("completion_tokens", 0), } diff --git a/tests/benchmarks/tool_calls_realworld.py b/tests/benchmarks/tool_calls_realworld.py new file mode 100644 index 00000000..c86780e7 --- /dev/null +++ b/tests/benchmarks/tool_calls_realworld.py @@ -0,0 +1,493 @@ +"""Tool-Calling Benchmark: Real-World Tool Routing (Single + Multi-Turn). + +Validates cascadeflow tool-call routing, acceptance, and savings on realistic +product tasks with mixed schemas and multi-turn follow-ups. 
+""" + +import json +from typing import Any + +from cascadeflow import CascadeAgent, ModelConfig + +from .base import Benchmark + + +WEATHER_TOOL = { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get the weather for a location and date.", + "parameters": { + "type": "object", + "properties": { + "location": {"type": "string", "description": "City name"}, + "date": {"type": "string", "description": "Date or day"}, + }, + "required": ["location"], + }, + }, +} + +CURRENCY_TOOL = { + "type": "function", + "function": { + "name": "convert_currency", + "description": "Convert an amount between currencies.", + "parameters": { + "type": "object", + "properties": { + "amount": {"type": "number"}, + "from_currency": {"type": "string"}, + "to_currency": {"type": "string"}, + }, + "required": ["amount", "from_currency", "to_currency"], + }, + }, +} + +ORDER_TOOL = { + "type": "function", + "function": { + "name": "lookup_order", + "description": "Lookup order status by order ID.", + "parameters": { + "type": "object", + "properties": {"order_id": {"type": "string"}}, + "required": ["order_id"], + }, + }, +} + +UPDATE_ORDER_TOOL = { + "type": "function", + "function": { + "name": "update_order_status", + "description": "Update an order status by order ID.", + "parameters": { + "type": "object", + "properties": { + "order_id": {"type": "string"}, + "status": {"type": "string"}, + }, + "required": ["order_id", "status"], + }, + }, +} + +CALENDAR_TOOL = { + "type": "function", + "function": { + "name": "create_calendar_event", + "description": "Create a calendar event with attendees.", + "parameters": { + "type": "object", + "properties": { + "title": {"type": "string"}, + "date": {"type": "string"}, + "time": {"type": "string"}, + "attendees": {"type": "array", "items": {"type": "string"}}, + }, + "required": ["title", "date"], + }, + }, +} + +STOCK_TOOL = { + "type": "function", + "function": { + "name": "get_stock_quote", + "description": "Get the latest stock price for a ticker.", + "parameters": { + "type": "object", + "properties": {"symbol": {"type": "string"}}, + "required": ["symbol"], + }, + }, +} + +FLIGHT_TOOL = { + "type": "function", + "function": { + "name": "book_flight", + "description": "Book a flight between two cities.", + "parameters": { + "type": "object", + "properties": { + "origin": {"type": "string"}, + "destination": {"type": "string"}, + "date": {"type": "string"}, + "passengers": {"type": "number"}, + }, + "required": ["origin", "destination", "date"], + }, + }, +} + +PROFILE_TOOL = { + "type": "function", + "function": { + "name": "get_user_profile", + "description": "Fetch a user profile by ID.", + "parameters": { + "type": "object", + "properties": {"user_id": {"type": "string"}}, + "required": ["user_id"], + }, + }, +} + +HELP_TOOL = { + "type": "function", + "function": { + "name": "search_help_center", + "description": "Search support docs for a query.", + "parameters": { + "type": "object", + "properties": {"query": {"type": "string"}}, + "required": ["query"], + }, + }, +} + +TICKET_TOOL = { + "type": "function", + "function": { + "name": "create_support_ticket", + "description": "Create a support ticket.", + "parameters": { + "type": "object", + "properties": { + "subject": {"type": "string"}, + "description": {"type": "string"}, + "priority": {"type": "string"}, + }, + "required": ["subject", "description"], + }, + }, +} + + +class ToolCallsRealWorldBenchmark(Benchmark): + """Tool call benchmark for realistic tool routing and 
multi-turn context.""" + + def __init__( + self, + drafter_model: str = "claude-haiku-4-5-20251001", + verifier_model: str = "claude-opus-4-5-20251101", + quality_threshold: float = 0.7, + max_samples: int = 18, + ): + super().__init__( + dataset_name="ToolCalls-RealWorld-18", + drafter_model=drafter_model, + verifier_model=verifier_model, + baseline_model=verifier_model, + quality_threshold=quality_threshold, + max_samples=max_samples, + ) + self.tools = [ + WEATHER_TOOL, + CURRENCY_TOOL, + ORDER_TOOL, + UPDATE_ORDER_TOOL, + CALENDAR_TOOL, + STOCK_TOOL, + FLIGHT_TOOL, + PROFILE_TOOL, + HELP_TOOL, + TICKET_TOOL, + ] + + def load_dataset(self) -> list[tuple[Any, Any]]: + cases = [ + { + "case_id": "REAL/WEATHER/PARIS", + "query": "What's the weather in Paris?", + "messages": None, + "expected_tool": "get_weather", + "expected_params": {"location": "paris"}, + }, + { + "case_id": "REAL/WEATHER/TOKYO", + "query": "Get the weather in Tokyo for tomorrow.", + "messages": None, + "expected_tool": "get_weather", + "expected_params": {"location": "tokyo", "date": "tomorrow"}, + }, + { + "case_id": "REAL/WEATHER/MULTITURN", + "query": "And tomorrow?", + "messages": [ + {"role": "user", "content": "What's the weather in Berlin?"}, + {"role": "assistant", "content": "Let me check that for you."}, + {"role": "user", "content": "And tomorrow?"}, + ], + "expected_tool": "get_weather", + "expected_params": {"location": "berlin", "date": "tomorrow"}, + }, + { + "case_id": "REAL/CURRENCY/USD_EUR", + "query": "Convert 10 USD to EUR.", + "messages": None, + "expected_tool": "convert_currency", + "expected_params": {"amount": 10, "from_currency": "USD", "to_currency": "EUR"}, + }, + { + "case_id": "REAL/CURRENCY/MULTITURN", + "query": "Actually convert 150 GBP to USD.", + "messages": [ + {"role": "user", "content": "Convert 100 GBP to USD."}, + {"role": "assistant", "content": "Sure, I can help with that."}, + {"role": "user", "content": "Actually convert 150 GBP to USD."}, + ], + "expected_tool": "convert_currency", + "expected_params": {"amount": 150, "from_currency": "GBP", "to_currency": "USD"}, + }, + { + "case_id": "REAL/ORDER/LOOKUP", + "query": "Track order #12345.", + "messages": None, + "expected_tool": "lookup_order", + "expected_params": {"order_id": "12345"}, + }, + { + "case_id": "REAL/ORDER/UPDATE_DIRECT", + "query": "Update order #555 to delivered.", + "messages": None, + "expected_tool": "update_order_status", + "expected_params": {"order_id": "555", "status": "delivered"}, + }, + { + "case_id": "REAL/ORDER/UPDATE_MULTITURN", + "query": "Update it to shipped.", + "messages": [ + {"role": "user", "content": "Track order #12345."}, + {"role": "assistant", "content": "I can help with that."}, + {"role": "user", "content": "Update it to shipped."}, + ], + "expected_tool": "update_order_status", + "expected_params": {"order_id": "12345", "status": "shipped"}, + }, + { + "case_id": "REAL/STOCK/TSLA", + "query": "What's the current TSLA stock price?", + "messages": None, + "expected_tool": "get_stock_quote", + "expected_params": {"symbol": "TSLA"}, + }, + { + "case_id": "REAL/CALENDAR/MEETING", + "query": "Schedule a meeting with Alex tomorrow at 2pm.", + "messages": None, + "expected_tool": "create_calendar_event", + "expected_params": { + "title": "meeting", + "date": "tomorrow", + "time": "2pm", + "attendees": ["alex"], + }, + }, + { + "case_id": "REAL/FLIGHT/BOOK", + "query": "Book a flight from SFO to JFK next Monday.", + "messages": None, + "expected_tool": "book_flight", + "expected_params": { 
+ "origin": "SFO", + "destination": "JFK", + "date": "next monday", + }, + }, + { + "case_id": "REAL/PROFILE/LOOKUP", + "query": "Show me the profile for user 42.", + "messages": None, + "expected_tool": "get_user_profile", + "expected_params": {"user_id": "42"}, + }, + { + "case_id": "REAL/HELP/SEARCH", + "query": "Search the help center for password reset steps.", + "messages": None, + "expected_tool": "search_help_center", + "expected_params": {"query": "password reset"}, + }, + { + "case_id": "REAL/TICKET/CREATE", + "query": "Create a support ticket: login is broken, priority high.", + "messages": None, + "expected_tool": "create_support_ticket", + "expected_params": {"subject": "login", "priority": "high"}, + }, + { + "case_id": "REAL/WEATHER/NYC", + "query": "What's the weather in New York next Friday?", + "messages": None, + "expected_tool": "get_weather", + "expected_params": {"location": "new york", "date": "next friday"}, + }, + { + "case_id": "REAL/CURRENCY/EUR_JPY", + "query": "Convert 99 EUR to JPY.", + "messages": None, + "expected_tool": "convert_currency", + "expected_params": {"amount": 99, "from_currency": "EUR", "to_currency": "JPY"}, + }, + { + "case_id": "REAL/ORDER/LOOKUP/MULTITURN", + "query": "What's the status now?", + "messages": [ + {"role": "user", "content": "Track order #888."}, + {"role": "assistant", "content": "Let me check that for you."}, + {"role": "user", "content": "What's the status now?"}, + ], + "expected_tool": "lookup_order", + "expected_params": {"order_id": "888"}, + }, + { + "case_id": "REAL/CALENDAR/TEAM", + "query": "Create a calendar event for team sync on Friday at 10am with sara@acme.com.", + "messages": None, + "expected_tool": "create_calendar_event", + "expected_params": { + "title": "team sync", + "date": "friday", + "time": "10am", + "attendees": ["sara@acme.com"], + }, + }, + { + "case_id": "REAL/FLIGHT/MULTITURN", + "query": "Make it for two passengers.", + "messages": [ + {"role": "user", "content": "Book a flight from LAX to SEA next Tuesday."}, + {"role": "assistant", "content": "Sure, I can book that."}, + {"role": "user", "content": "Make it for two passengers."}, + ], + "expected_tool": "book_flight", + "expected_params": { + "origin": "LAX", + "destination": "SEA", + "date": "next tuesday", + "passengers": 2, + }, + }, + ] + return [(case, case) for case in cases[: self.max_samples]] + + def evaluate_prediction(self, prediction: Any, ground_truth: Any) -> tuple[bool, float]: + tool_calls = prediction if isinstance(prediction, list) else [] + if not tool_calls: + return False, 0.0 + + expected_tool = ground_truth.get("expected_tool") + expected_params = ground_truth.get("expected_params", {}) + + for tool_call in tool_calls: + name = ( + tool_call.get("name") + or tool_call.get("tool") + or (tool_call.get("function") or {}).get("name") + ) + args = ( + tool_call.get("arguments") + or tool_call.get("args") + or tool_call.get("parameters") + or (tool_call.get("function") or {}).get("arguments") + ) + + if isinstance(args, str): + try: + args = json.loads(args) + except json.JSONDecodeError: + args = {} + + if not isinstance(args, dict): + args = {} + + if name != expected_tool: + continue + + params_match = True + for key, expected_value in expected_params.items(): + if key not in args: + params_match = False + break + if expected_value is None: + continue + actual_value = args.get(key) + if isinstance(expected_value, list): + actual_list = actual_value if isinstance(actual_value, list) else [actual_value] + actual_text = " 
".join(str(item).lower() for item in actual_list) + for expected_item in expected_value: + if str(expected_item).lower() not in actual_text: + params_match = False + break + if not params_match: + break + elif isinstance(expected_value, (int, float)): + try: + if float(actual_value) != float(expected_value): + params_match = False + break + except (TypeError, ValueError): + params_match = False + break + elif isinstance(expected_value, str): + if expected_value.lower() not in str(actual_value).lower(): + params_match = False + break + + if params_match: + return True, 1.0 + + return False, 0.0 + + async def run_cascade(self, query: Any) -> dict[str, Any]: + case = query if isinstance(query, dict) else {"query": str(query)} + prompt = case.get("query", "") + messages = case.get("messages") + + agent = CascadeAgent( + models=[ + ModelConfig(name=self.drafter_model, provider="anthropic", cost=0.003), + ModelConfig(name=self.verifier_model, provider="anthropic", cost=0.045), + ], + quality={"threshold": self.quality_threshold}, + ) + + result = await agent.run( + prompt, + max_tokens=220, + temperature=0.0, + tools=self.tools, + tool_choice="auto", + messages=messages, + ) + + prediction = result.tool_calls or [] + + return { + "prediction": prediction, + "model_used": result.model_used, + "accepted": result.draft_accepted, + "quality_score": result.quality_score or 0.0, + "routing_strategy": result.routing_strategy, + "drafter_cost": result.draft_cost or 0.0, + "verifier_cost": result.verifier_cost or 0.0, + "total_cost": result.total_cost, + "latency_ms": result.latency_ms, + "cascadeflow_latency_ms": ( + (result.complexity_detection_ms or 0) + + (result.metadata.get("domain_detection_ms", 0) if result.metadata else 0) + + (result.metadata.get("tool_complexity_analysis_ms", 0) if result.metadata else 0) + + (result.quality_verification_ms or 0) + ), + "tokens_input": result.metadata.get("prompt_tokens", 0), + "tokens_output": result.metadata.get("completion_tokens", 0), + } + + +async def run_tool_calls_realworld_benchmark() -> Any: + benchmark = ToolCallsRealWorldBenchmark() + return await benchmark.run() diff --git a/tests/benchmarks/truthfulqa.py b/tests/benchmarks/truthfulqa.py index 08e7d30c..cc3bced4 100644 --- a/tests/benchmarks/truthfulqa.py +++ b/tests/benchmarks/truthfulqa.py @@ -449,10 +449,17 @@ async def run_cascade(self, query: str) -> dict[str, Any]: "model_used": result.model_used, "accepted": result.draft_accepted, "quality_score": result.quality_score or 0.0, + "routing_strategy": result.routing_strategy, "drafter_cost": result.draft_cost or 0.0, "verifier_cost": result.verifier_cost or 0.0, "total_cost": result.total_cost, "latency_ms": result.latency_ms, + "cascadeflow_latency_ms": ( + (result.complexity_detection_ms or 0) + + (result.metadata.get("domain_detection_ms", 0) if result.metadata else 0) + + (result.metadata.get("tool_complexity_analysis_ms", 0) if result.metadata else 0) + + (result.quality_verification_ms or 0) + ), "tokens_input": result.metadata.get("prompt_tokens", 0), "tokens_output": result.metadata.get("completion_tokens", 0), } diff --git a/tests/benchmarks/utils.py b/tests/benchmarks/utils.py new file mode 100644 index 00000000..1a27fe89 --- /dev/null +++ b/tests/benchmarks/utils.py @@ -0,0 +1,66 @@ +"""Benchmark utilities for model selection and provider resolution.""" + +from __future__ import annotations + +import os +from typing import Optional + +from .benchmark_config import DRAFTER_MODELS, VERIFIER_MODELS + + +def get_env_model(env_name: str, 
default: str) -> str:
+    """Return model name overridden by environment variable if set."""
+    return os.getenv(env_name, default)
+
+
+def resolve_model_provider(model: str, fallback: str = "openai") -> str:
+    """Infer provider from model name."""
+    model_lower = model.lower()
+    if "claude" in model_lower or model_lower.startswith("anthropic/"):
+        return "anthropic"
+    if model_lower.startswith("gpt") or model_lower.startswith("openai/"):
+        return "openai"
+    if model_lower.startswith("gemini") or model_lower.startswith("google/"):
+        return "google"
+    if model_lower.startswith("groq/"):
+        return "groq"
+    if model_lower.startswith("together/"):
+        return "together"
+    if model_lower.startswith("huggingface/"):
+        return "huggingface"
+    if model_lower.startswith("deepseek/"):
+        return "deepseek"
+    return fallback
+
+
+def resolve_model_cost(model: str, fallback: float) -> float:
+    """Resolve model cost per 1k tokens from benchmark config tiers."""
+    for tier in DRAFTER_MODELS.values():
+        if tier.name == model:
+            return tier.cost_per_1k
+    for tier in VERIFIER_MODELS.values():
+        if tier.name == model:
+            return tier.cost_per_1k
+
+    # Approximate per-1k-token fallbacks for Anthropic models that are not
+    # listed in the benchmark tier tables.
+    model_lower = model.lower()
+    if "claude-haiku" in model_lower:
+        return 0.003
+    if "claude-opus" in model_lower:
+        return 0.045
+    if "claude-sonnet" in model_lower:
+        return 0.003
+
+    return fallback
+
+
+def resolve_model_pair(
+    default_drafter: str,
+    default_verifier: str,
+    env_drafter: str = "DRAFTER_MODEL",
+    env_verifier: str = "VERIFIER_MODEL",
+) -> tuple[str, str]:
+    """Return drafter/verifier names with env overrides applied."""
+    return (
+        get_env_model(env_drafter, default_drafter),
+        get_env_model(env_verifier, default_verifier),
+    )
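
For reference, a minimal usage sketch (not part of the patch) of how the new `tests/benchmarks/utils.py` helpers are intended to compose when a benchmark builds its agent. It mirrors the `ruler_benchmark.py` hunk above and assumes only the `ModelConfig` / `CascadeAgent` interfaces and the `DRAFTER_MODEL` / `VERIFIER_MODEL` environment overrides already shown in this diff.

```python
from cascadeflow import CascadeAgent
from cascadeflow.schema.config import ModelConfig
from tests.benchmarks.utils import (
    resolve_model_cost,
    resolve_model_pair,
    resolve_model_provider,
)

# DRAFTER_MODEL / VERIFIER_MODEL environment variables override these defaults.
drafter, verifier = resolve_model_pair("gpt-4o-mini", "gpt-4o")

agent = CascadeAgent(
    models=[
        ModelConfig(
            name=drafter,
            provider=resolve_model_provider(drafter),    # e.g. "openai"
            cost=resolve_model_cost(drafter, 0.00015),   # $/1k tokens fallback
        ),
        ModelConfig(
            name=verifier,
            provider=resolve_model_provider(verifier),
            cost=resolve_model_cost(verifier, 0.0025),
        ),
    ],
)
```

Elsewhere in the patch, the new `cascadeflow_latency_ms` column exported by the benchmarks is the per-request sum of complexity detection, domain detection, tool-complexity analysis, and quality verification time; this is what the report surfaces as "Average Cascade Overhead" alongside end-to-end `latency_ms`.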