diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json
index e85de66..d3165a2 100644
--- a/.claude-plugin/marketplace.json
+++ b/.claude-plugin/marketplace.json
@@ -1,7 +1,7 @@
 {
   "$schema": "https://anthropic.com/claude-code/marketplace.schema.json",
   "name": "braintrust-claude-plugin",
-  "version": "1.2.0",
+  "version": "1.3.0",
   "description": "Braintrust plugins for LLM evaluation, logging, and observability",
   "owner": {
     "name": "Braintrust",
@@ -12,14 +12,14 @@
       "name": "braintrust",
       "description": "Enables AI agents to use Braintrust for LLM evaluation, logging, and observability. Provides correct API usage, working examples, and helper scripts.",
-      "version": "1.1.0",
-      "source": "./",
+      "version": "1.3.0",
+      "source": "./plugins/braintrust",
       "category": "development"
     },
     {
       "name": "trace-claude-code",
       "description": "Automatically trace Claude Code conversations to Braintrust. Captures user messages, assistant responses, and tool calls for observability.",
-      "version": "1.0.0",
-      "source": "./skills/trace-claude-code",
+      "version": "1.1.0",
+      "source": "./plugins/trace-claude-code",
       "category": "observability"
     }
   ]
diff --git a/AGENTS.md b/AGENTS.md
index d05c756..9b0c185 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -1,5 +1,34 @@
 # Agent guidelines
 
+## About this repository
+
+This is the **Braintrust Claude Code plugin marketplace** - a repository that distributes Claude Code plugins for Braintrust integration.
+
+### Structure
+
+```
+braintrust-claude-plugin/
+├── .claude-plugin/
+│   └── marketplace.json   # Marketplace catalog (lists available plugins)
+├── plugins/
+│   ├── braintrust/        # Plugin: Braintrust evaluation & logging
+│   └── trace-claude-code/ # Plugin: Session tracing to Braintrust
+└── evals/                 # Evaluation suite for testing the plugins
+```
+
+### Plugins
+
+| Plugin | Description |
+|--------|-------------|
+| `braintrust` | Enables AI agents to use Braintrust for LLM evaluation, logging, and observability. Includes MCP server config and the `troubleshoot-braintrust-mcp` skill. |
+| `trace-claude-code` | Automatically traces Claude Code conversations to Braintrust. Uses hooks to capture sessions, turns, and tool calls. |
+
+### Terminology
+
+- **Marketplace**: A repository with a `marketplace.json` that catalogs multiple plugins for distribution
+- **Plugin**: An installable unit with its own `.claude-plugin/plugin.json` manifest
+- **Skill**: A capability within a plugin (e.g., `troubleshoot-braintrust-mcp` is a skill in the `braintrust` plugin)
+
 ## Style conventions
 
 - Use sentence case for all text (capitalize first word only, except for proper nouns and code references)
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
new file mode 100644
index 0000000..e1323c9
--- /dev/null
+++ b/CONTRIBUTING.md
@@ -0,0 +1,51 @@
+# Developing the plugins
+
+## Prerequisites
+
+- Python 3.12+
+- [uv](https://docs.astral.sh/uv/) package manager
+
+## Local testing
+
+Test a plugin locally without installing it from the marketplace:
+
+```bash
+claude --plugin-dir /path/to/this-repo/plugins/<plugin-name>
+# example
+claude --plugin-dir /path/to/this-repo/plugins/braintrust
+```
+
+## Running evals
+
+The `evals/` directory contains tests that verify the plugins work correctly (e.g., Claude generates valid SQL queries, logs data properly).
+
+```bash
+cd evals
+export BRAINTRUST_API_KEY="your-key"
+
+# Run all evals
+uv run braintrust eval .
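+# Note: the e2e evals call the live Braintrust API and create
+# temporary skill-eval-* test projects, so the key must be valid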
+
+# Run specific eval
+uv run braintrust eval eval_e2e_log_fetch.py
+```
+
+## Pre-commit hooks
+
+```bash
+# Install hooks
+uv run pre-commit install
+
+# Run all hooks
+uv run pre-commit run --all-files
+```
+
+## Updating the plugins
+
+After making changes:
+
+1. Bump the version in the plugin's `plugins/<name>/.claude-plugin/plugin.json` and in `.claude-plugin/marketplace.json`
+2. Commit and push
+3. Users update with: `claude plugin marketplace update braintrust-claude-plugin`
diff --git a/README.md b/README.md
index 2a0520c..bd4611c 100644
--- a/README.md
+++ b/README.md
@@ -1,171 +1,66 @@
-# Braintrust Claude plugins
+# Braintrust Claude Code marketplace
 
-Claude Code plugins for Braintrust - LLM evaluation, logging, observability, and tracing.
+A Claude Code plugin marketplace for [Braintrust](https://braintrust.dev) integration - LLM evaluation, logging, observability, and session tracing.
 
-## Plugins
-
-### 1. Braintrust (evaluation & logging)
-
-Enables AI agents to use Braintrust for LLM evaluation, logging, and observability.
-
-```bash
-claude plugin marketplace add braintrustdata/braintrust-claude-plugin
-claude plugin install braintrust@braintrust-claude-plugin
-```
-
-### 2. Trace Claude Code (observability)
-
-Automatically trace Claude Code conversations to Braintrust.
-
-```bash
-claude plugin install trace-claude-code@braintrust-claude-plugin
-```
+## Prerequisites
 
-See [trace-claude-code/SKILL.md](skills/trace-claude-code/SKILL.md) for setup instructions.
+- A [Braintrust account](https://braintrust.dev)
+- `BRAINTRUST_API_KEY` exported in your environment
 
-## Agent skills
+## Installation
 
-This repo includes skills built on the open [Agent Skills](https://agentskills.io/home) format, compatible with Claude Code, Cursor, Amp, and other agents.
+Add the marketplace:
 
-**Install all skills:**
 ```bash
-curl -sL https://github.com/braintrustdata/braintrust-claude-plugin/archive/main.tar.gz | tar -xz -C ~/.claude/skills --strip-components=2 braintrust-claude-plugin-main/skills
-```
-
-Available skills:
-- [using-braintrust](skills/using-braintrust/SKILL.md) - Evaluation, logging, and SQL queries
-- [trace-claude-code](skills/trace-claude-code/SKILL.md) - Automatic conversation tracing
-
-## Setup
-
-Create a `.env` file in your project directory:
-
-```
-BRAINTRUST_API_KEY=your-api-key-here
-```
-
-The plugin scripts automatically load `.env` files from the current directory or parent directories.
-
-## What the plugin provides
-
-### Scripts
-
-The plugin includes ready-to-use scripts for common operations:
-
-**Query logs with SQL:**
-```bash
-uv run query_logs.py --project "My Project" --query "SELECT count(*) as count FROM logs WHERE created > now() - interval 1 day"
-```
-
-**Log data:**
-```bash
-uv run log_data.py --project "My Project" --input "hello" --output "world"
-```
-
-**Run evaluations:**
-```bash
-uv run run_eval.py --project "My Project" --data '[{"input": "test", "expected": "test"}]'
-```
-
-### SDK patterns
-
-The skill teaches Claude how to use the Braintrust SDK correctly:
-
-```python
-# Correct Eval() usage - project name is FIRST POSITIONAL arg
-braintrust.Eval(
-    "My Project",  # NOT project_name="My Project"
-    data=lambda: [...],
-    task=lambda input: ...,
-    scores=[Factuality],
-)
-
-# Logging with flush
-logger = braintrust.init_logger(project="My Project")
-logger.log(input="hello", output="world")
-logger.flush()  # Important!
-```
-
-### SQL query syntax
-
-The skill teaches Claude to write SQL queries for Braintrust logs:
-
-```sql
-SELECT input, output, created FROM logs WHERE created > now() - interval 1 day LIMIT 10
-```
-
-**SQL quirks in Braintrust:**
-- Use `hour()`, `day()`, `month()`, `year()` instead of `date_trunc()`
-- Intervals use format `interval 1 day` (no quotes, singular unit)
-
-## Project structure
-
-```
-braintrust-claude-plugin/
-├── .claude-plugin/
-│   ├── plugin.json          # Plugin manifest
-│   └── marketplace.json     # Marketplace index
-├── skills/
-│   ├── using-braintrust/
-│   │   ├── SKILL.md         # Evaluation & logging skill
-│   │   └── scripts/         # Helper scripts
-│   │       ├── query_logs.py
-│   │       ├── log_data.py
-│   │       └── run_eval.py
-│   └── trace-claude-code/
-│       ├── SKILL.md         # Claude Code tracing skill
-│       └── hooks/
-│           └── stop_hook.sh # Hook script
-├── evals/                   # Evaluation suite
-│   ├── eval_e2e_*.py        # End-to-end tests
-│   └── eval_*.py            # Baseline tests
-└── README.md
+claude plugin marketplace add braintrustdata/braintrust-claude-plugin
 ```
 
-## Development
+Then install the plugins you need:
 
-### Prerequisites
+## Plugins
 
-- Python 3.12+
-- [uv](https://docs.astral.sh/uv/) package manager
+### braintrust
 
-### Local testing
+Enables AI agents to use Braintrust for LLM evaluation, logging, and observability.
 
-Test the plugin without installing from marketplace:
+- Query Braintrust projects, experiments, datasets, and logs
+- Instrument your code with the Braintrust SDK and write evals
 
 ```bash
-claude --plugin-dir /path/to/braintrust-claude-plugin
+claude plugin install braintrust@braintrust-claude-plugin
 ```
 
-### Running evals
+### trace-claude-code
 
-The `evals/` directory contains tests that verify the skill works correctly (e.g., Claude generates valid SQL queries, logs data properly).
+Automatically traces Claude Code conversations to Braintrust. Captures sessions, conversation turns, and tool calls as hierarchical traces.
 
 ```bash
-cd evals
-export BRAINTRUST_API_KEY="your-key"
-
-# Run all evals
-uv run braintrust eval .
-
-# Run specific eval
-uv run braintrust eval eval_e2e_log_fetch.py
+claude plugin install trace-claude-code@braintrust-claude-plugin
 ```
 
-### Pre-commit hooks
+To enable tracing, add the following to your `~/.claude/settings.json` or your project's `.claude/settings.local.json`:
 
-```bash
-# Install hooks
-uv run pre-commit install
-
-# Run all hooks
-uv run pre-commit run --all-files
+```json
+{
+  "env": {
+    "TRACE_TO_BRAINTRUST": "true",
+    "BRAINTRUST_CC_PROJECT": "project-name-to-send-cc-traces-to"
+  }
+}
 ```
 
-## Updating the plugin
-
-After making changes:
-
-1. Bump version in `.claude-plugin/plugin.json` and `.claude-plugin/marketplace.json`
-2. Commit and push
-3. Users update with: `claude plugin marketplace update braintrust-claude-plugin`
+If `BRAINTRUST_CC_PROJECT` is not set, traces are sent to a project named `claude-code` by default.
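+
+## Verifying your setup
+
+To confirm that `BRAINTRUST_API_KEY` is picked up, log a single test record with the Braintrust SDK. A minimal sketch - the project name here is just an example:
+
+```python
+import braintrust
+
+logger = braintrust.init_logger(project="My Project")
+logger.log(input="hello", output="world")
+logger.flush()  # flush before exiting so the record is actually sent
+```
+
+The record should then appear in that project's logs at braintrust.dev.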
diff --git a/evals/eval_datasets.py b/evals/eval_datasets.py index fa4ac90..1034f36 100644 --- a/evals/eval_datasets.py +++ b/evals/eval_datasets.py @@ -142,7 +142,6 @@ def baseline_task(input_str): scores=[criteria_scorer], metadata={ "description": "Tests agent's ability to create and manage Braintrust datasets", - "skill": "using-braintrust", "category": "datasets", }, ) diff --git a/evals/eval_docs_search.py b/evals/eval_docs_search.py index 9b6d96b..6856b11 100644 --- a/evals/eval_docs_search.py +++ b/evals/eval_docs_search.py @@ -159,7 +159,6 @@ def baseline_task(input: str) -> str: scores=[criteria_scorer], metadata={ "description": "Tests agent's ability to answer Braintrust documentation questions", - "skill": "using-braintrust", "category": "docs_search", }, ) diff --git a/evals/eval_e2e_eval_improve.py b/evals/eval_e2e_eval_improve.py index 14890b6..8c4eaf5 100644 --- a/evals/eval_e2e_eval_improve.py +++ b/evals/eval_e2e_eval_improve.py @@ -14,6 +14,7 @@ """ import asyncio +import os import sys import uuid from pathlib import Path @@ -30,41 +31,43 @@ TEST_RUN_ID = str(uuid.uuid4())[:8] TEST_PROJECT_NAME = f"skill-eval-experiment-{TEST_RUN_ID}" -# Load skill content -SKILL_PATH = Path(__file__).parent.parent / "skill" / "SKILL.md" -SKILL_CONTENT = SKILL_PATH.read_text() if SKILL_PATH.exists() else "" - # Setup Claude Agent SDK patching setup_claude_agent_sdk() +# MCP server configuration for Braintrust +MCP_SERVERS = { + "braintrust": { + "type": "http", + "url": "https://api.braintrust.dev/mcp", + "headers": {"Authorization": f"Bearer {os.environ.get('BRAINTRUST_API_KEY', '')}"}, + } +} + -async def run_claude_agent(prompt: str, max_turns: int = 15, use_skill: bool = True) -> dict: +async def run_claude_agent(prompt: str, max_turns: int = 15, use_mcp: bool = True) -> dict: """ Run Claude Agent with code execution enabled. + + Args: + prompt: The prompt to send to Claude + max_turns: Maximum number of conversation turns + use_mcp: If True, connect the Braintrust MCP server for enhanced capabilities """ - base_prompt = """You are an expert at Braintrust, an LLM evaluation platform. + system_prompt = """You are an expert at Braintrust, an LLM evaluation platform. You have access to code execution. Use Python to complete the tasks. -Be concise and execute code directly - don't just explain.""" +Be concise and execute code directly - don't just explain. - if use_skill and SKILL_CONTENT: - system_prompt = f"""{base_prompt} - -Here is the reference documentation for using Braintrust: - -{SKILL_CONTENT} - -Follow the examples in the documentation exactly. Pay special attention to: +When running evals, use braintrust.Eval() with proper task and scorer functions. 
+Pay special attention to: - Eval() takes the project name as the FIRST POSITIONAL argument, not a keyword argument - Always call logger.flush() after logging""" - else: - system_prompt = f"""{base_prompt} -When running evals, use braintrust.Eval() with proper task and scorer functions.""" options = ClaudeAgentOptions( model="claude-sonnet-4-5-20250929", system_prompt=system_prompt, max_turns=max_turns, permission_mode="bypassPermissions", + mcp_servers=MCP_SERVERS if use_mcp else {}, ) success = False @@ -319,7 +322,6 @@ def eval_ran_scorer(output: dict, expected: dict, **kwargs) -> Score: scores=[experiments_created_scorer, task_completed_scorer, eval_ran_scorer], metadata={ "description": "Tests Claude's ability to create and run experiments, verified via Braintrust API", - "skill": "using-braintrust", "category": "e2e", "test_run_id": TEST_RUN_ID, "test_project": TEST_PROJECT_NAME, diff --git a/evals/eval_e2e_log_fetch.py b/evals/eval_e2e_log_fetch.py index 095b236..8b2d28a 100644 --- a/evals/eval_e2e_log_fetch.py +++ b/evals/eval_e2e_log_fetch.py @@ -12,6 +12,7 @@ """ import asyncio +import os import sys import uuid from pathlib import Path @@ -28,42 +29,44 @@ TEST_RUN_ID = str(uuid.uuid4())[:8] TEST_PROJECT_NAME = f"skill-eval-e2e-{TEST_RUN_ID}" -# Load skill content -SKILL_PATH = Path(__file__).parent.parent / "skill" / "SKILL.md" -SKILL_CONTENT = SKILL_PATH.read_text() if SKILL_PATH.exists() else "" - # Setup Claude Agent SDK patching (will trace within parent span context) setup_claude_agent_sdk() +# MCP server configuration for Braintrust +MCP_SERVERS = { + "braintrust": { + "type": "http", + "url": "https://api.braintrust.dev/mcp", + "headers": {"Authorization": f"Bearer {os.environ.get('BRAINTRUST_API_KEY', '')}"}, + } +} + -async def run_claude_agent(prompt: str, max_turns: int = 10, use_skill: bool = True) -> dict: +async def run_claude_agent(prompt: str, max_turns: int = 10, use_mcp: bool = True) -> dict: """ Run Claude Agent with code execution enabled and collect results. Returns dict with 'success', 'output', 'error' fields. + + Args: + prompt: The prompt to send to Claude + max_turns: Maximum number of conversation turns + use_mcp: If True, connect the Braintrust MCP server for enhanced capabilities """ - base_prompt = """You are an expert at Braintrust, an LLM evaluation platform. + system_prompt = """You are an expert at Braintrust, an LLM evaluation platform. You have access to code execution. Use Python to complete the tasks. -Be concise and execute code directly - don't just explain.""" - - if use_skill and SKILL_CONTENT: - system_prompt = f"""{base_prompt} - -Here is the reference documentation for using Braintrust: - -{SKILL_CONTENT} +Be concise and execute code directly - don't just explain. -Follow the examples in the documentation exactly. Pay special attention to: +Always use the braintrust SDK for logging and querying. 
+Pay special attention to: - Always call logger.flush() after logging to ensure data is sent - Use init_logger(project="name") to create a logger""" - else: - system_prompt = f"""{base_prompt} -Always use the braintrust SDK for logging and querying.""" options = ClaudeAgentOptions( model="claude-sonnet-4-5-20250929", system_prompt=system_prompt, max_turns=max_turns, permission_mode="bypassPermissions", + mcp_servers=MCP_SERVERS if use_mcp else {}, ) success = False @@ -354,7 +357,6 @@ def sql_query_scorer(output: dict, expected: dict, **kwargs) -> Score: scores=[logs_created_scorer, correct_count_scorer, task_completed_scorer], metadata={ "description": "Tests Claude's ability to log data, verified by querying Braintrust directly", - "skill": "using-braintrust", "category": "e2e", "test_run_id": TEST_RUN_ID, "test_project": TEST_PROJECT_NAME, @@ -369,7 +371,6 @@ def sql_query_scorer(output: dict, expected: dict, **kwargs) -> Score: scores=[sql_query_scorer, task_completed_scorer], metadata={ "description": "Tests Claude's ability to query logs using SQL syntax", - "skill": "using-braintrust", "category": "e2e", "test_run_id": TEST_RUN_ID, }, diff --git a/evals/eval_experiments.py b/evals/eval_experiments.py index 4e7993a..e77d49b 100644 --- a/evals/eval_experiments.py +++ b/evals/eval_experiments.py @@ -142,7 +142,6 @@ def baseline_task(input_str): scores=[criteria_scorer], metadata={ "description": "Tests agent's ability to create, run, and analyze Braintrust experiments", - "skill": "using-braintrust", "category": "experiments", }, ) diff --git a/evals/eval_log_querying.py b/evals/eval_log_querying.py index cd2dea2..0ed0a3c 100644 --- a/evals/eval_log_querying.py +++ b/evals/eval_log_querying.py @@ -126,7 +126,6 @@ def baseline_task(input_data): scores=[criteria_scorer], metadata={ "description": "Tests agent's ability to write correct SQL queries and log operations", - "skill": "using-braintrust", "category": "log_querying", }, ) diff --git a/.claude-plugin/plugin.json b/plugins/braintrust/.claude-plugin/plugin.json similarity index 92% rename from .claude-plugin/plugin.json rename to plugins/braintrust/.claude-plugin/plugin.json index 5d859ea..a14383a 100644 --- a/.claude-plugin/plugin.json +++ b/plugins/braintrust/.claude-plugin/plugin.json @@ -1,7 +1,7 @@ { "name": "braintrust", "description": "Enables AI agents to use Braintrust for LLM evaluation, logging, and observability. Provides correct API usage, working examples, and helper scripts for common operations.", - "version": "1.1.0", + "version": "1.3.0", "author": { "name": "Braintrust" } diff --git a/plugins/braintrust/.mcp.json b/plugins/braintrust/.mcp.json new file mode 100644 index 0000000..16048c4 --- /dev/null +++ b/plugins/braintrust/.mcp.json @@ -0,0 +1,11 @@ +{ + "mcpServers": { + "braintrust": { + "type": "http", + "url": "https://api.braintrust.dev/mcp", + "headers": { + "Authorization": "Bearer ${BRAINTRUST_API_KEY}" + } + } + } +} diff --git a/plugins/braintrust/skills/troubleshoot-braintrust-mcp/SKILL.md b/plugins/braintrust/skills/troubleshoot-braintrust-mcp/SKILL.md new file mode 100644 index 0000000..24f566e --- /dev/null +++ b/plugins/braintrust/skills/troubleshoot-braintrust-mcp/SKILL.md @@ -0,0 +1,46 @@ +--- +name: troubleshoot-braintrust-mcp +description: | + This plugin auto-configures a "braintrust" MCP server. If you can't see it or reach it, activate this skill +version: 1.0.0 +--- + +This Claude plugin automatically sets up a Braintrust MCP connection. 
It authenticates using the `BRAINTRUST_API_KEY` environment variable.
+
+## Troubleshooting steps
+
+### 1. Verify the environment variable is set
+
+Run `echo $BRAINTRUST_API_KEY` to check if the variable is exported.
+
+API keys can be created at https://www.braintrust.dev/app/settings?subroute=api-keys
+
+### 2. Verify the API key is valid
+
+Test the key by calling the Braintrust API:
+
+```bash
+curl -s https://api.braintrust.dev/api/self/me -H "Authorization: Bearer $BRAINTRUST_API_KEY"
+```
+
+- If valid: returns JSON with user info (id, email, organizations, etc.)
+- If invalid: returns an authentication error
+
+NOTE: Even if you can reach the API with curl, continue to attempt MCP setup. HTTP requests are a troubleshooting tool, not a replacement for MCP.
+
+### 3. Check if the MCP server is reachable
+
+If the key is valid but the connection still fails, check if the MCP server is up:
+
+```bash
+curl -s -o /dev/null -w "%{http_code}" https://api.braintrust.dev/mcp
+```
+
+- Any HTTP response (even 401 or 405) means the server is reachable
+- Connection timeout or "connection refused" means the server may be down
+
+### 4. Contact support
+
+If nothing else works, encourage the user to reach out:
+- Discord: https://discord.com/invite/6G8s47F44X
+- Email: support@braintrust.dev
diff --git a/plugins/trace-claude-code/.claude-plugin/plugin.json b/plugins/trace-claude-code/.claude-plugin/plugin.json
new file mode 100644
index 0000000..ccf7d47
--- /dev/null
+++ b/plugins/trace-claude-code/.claude-plugin/plugin.json
@@ -0,0 +1,8 @@
+{
+  "name": "trace-claude-code",
+  "description": "Automatically trace Claude Code conversations to Braintrust for observability. Captures sessions, conversation turns, and tool calls as hierarchical traces.",
+  "version": "1.1.0",
+  "author": {
+    "name": "Braintrust"
+  }
+}
diff --git a/skills/trace-claude-code/hooks/common.sh b/plugins/trace-claude-code/hooks/common.sh
similarity index 96%
rename from skills/trace-claude-code/hooks/common.sh
rename to plugins/trace-claude-code/hooks/common.sh
index 237b6e7..452d613 100755
--- a/skills/trace-claude-code/hooks/common.sh
+++ b/plugins/trace-claude-code/hooks/common.sh
@@ -88,13 +88,14 @@ check_requirements() {
     return 0
 }
 
-# Get or create project ID (cached)
+# Get or create project ID (cached per project name)
 get_project_id() {
     local name="$1"
+    local cache_key="project_id_$name"
 
     # Check cache first
     local cached_id
-    cached_id=$(get_state_value "project_id")
+    cached_id=$(get_state_value "$cache_key")
     if [ -n "$cached_id" ]; then
         echo "$cached_id"
         return 0
@@ -112,7 +113,7 @@ get_project_id() {
     pid=$(echo "$resp" | jq -r '.id // empty' 2>/dev/null)
 
     if [ -n "$pid" ]; then
-        set_state_value "project_id" "$pid"
+        set_state_value "$cache_key" "$pid"
         echo "$pid"
         return 0
     fi
@@ -124,7 +125,7 @@ get_project_id() {
     pid=$(echo "$resp" | jq -r '.id // empty' 2>/dev/null)
 
     if [ -n "$pid" ]; then
-        set_state_value "project_id" "$pid"
+        set_state_value "$cache_key" "$pid"
        echo "$pid"
         return 0
     fi
diff --git a/plugins/trace-claude-code/hooks/hooks.json b/plugins/trace-claude-code/hooks/hooks.json
new file mode 100644
index 0000000..0b5db46
--- /dev/null
+++ b/plugins/trace-claude-code/hooks/hooks.json
@@ -0,0 +1,55 @@
+{
+  "hooks": {
+    "SessionStart": [
+      {
+        "hooks": [
+          {
+            "type": "command",
+            "command": "bash ${CLAUDE_PLUGIN_ROOT}/hooks/session_start.sh"
+          }
+        ]
+      }
+    ],
+    "UserPromptSubmit": [
+      {
+        "hooks": [
+          {
+            "type": "command",
+            "command": "bash 
${CLAUDE_PLUGIN_ROOT}/hooks/user_prompt_submit.sh" + } + ] + } + ], + "PostToolUse": [ + { + "matcher": "*", + "hooks": [ + { + "type": "command", + "command": "bash ${CLAUDE_PLUGIN_ROOT}/hooks/post_tool_use.sh" + } + ] + } + ], + "Stop": [ + { + "hooks": [ + { + "type": "command", + "command": "bash ${CLAUDE_PLUGIN_ROOT}/hooks/stop_hook.sh" + } + ] + } + ], + "SessionEnd": [ + { + "hooks": [ + { + "type": "command", + "command": "bash ${CLAUDE_PLUGIN_ROOT}/hooks/session_end.sh" + } + ] + } + ] + } +} diff --git a/skills/trace-claude-code/hooks/post_tool_use.sh b/plugins/trace-claude-code/hooks/post_tool_use.sh similarity index 100% rename from skills/trace-claude-code/hooks/post_tool_use.sh rename to plugins/trace-claude-code/hooks/post_tool_use.sh diff --git a/skills/trace-claude-code/hooks/session_end.sh b/plugins/trace-claude-code/hooks/session_end.sh similarity index 100% rename from skills/trace-claude-code/hooks/session_end.sh rename to plugins/trace-claude-code/hooks/session_end.sh diff --git a/skills/trace-claude-code/hooks/session_start.sh b/plugins/trace-claude-code/hooks/session_start.sh similarity index 98% rename from skills/trace-claude-code/hooks/session_start.sh rename to plugins/trace-claude-code/hooks/session_start.sh index 7a584bf..620110a 100755 --- a/skills/trace-claude-code/hooks/session_start.sh +++ b/plugins/trace-claude-code/hooks/session_start.sh @@ -9,6 +9,7 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" source "$SCRIPT_DIR/common.sh" debug "SessionStart hook triggered" +debug "TRACE_TO_BRAINTRUST=$TRACE_TO_BRAINTRUST" tracing_enabled || { debug "Tracing disabled"; exit 0; } check_requirements || exit 0 diff --git a/skills/trace-claude-code/hooks/stop_hook.sh b/plugins/trace-claude-code/hooks/stop_hook.sh similarity index 100% rename from skills/trace-claude-code/hooks/stop_hook.sh rename to plugins/trace-claude-code/hooks/stop_hook.sh diff --git a/skills/trace-claude-code/hooks/user_prompt_submit.sh b/plugins/trace-claude-code/hooks/user_prompt_submit.sh similarity index 100% rename from skills/trace-claude-code/hooks/user_prompt_submit.sh rename to plugins/trace-claude-code/hooks/user_prompt_submit.sh diff --git a/skills/trace-claude-code/setup.sh b/plugins/trace-claude-code/setup.sh similarity index 100% rename from skills/trace-claude-code/setup.sh rename to plugins/trace-claude-code/setup.sh diff --git a/skills/trace-claude-code/SKILL.md b/skills/trace-claude-code/SKILL.md deleted file mode 100644 index 0c1bda3..0000000 --- a/skills/trace-claude-code/SKILL.md +++ /dev/null @@ -1,247 +0,0 @@ ---- -name: trace-claude-code -description: | - Automatically trace Claude Code conversations to Braintrust for observability. - Captures sessions, conversation turns, and tool calls as hierarchical traces. -version: 1.1.0 ---- - -# Trace Claude Code to Braintrust - -Automatically send Claude Code conversations to Braintrust for tracing and observability. Get full visibility into your AI coding sessions with hierarchical traces showing sessions, turns, and every tool call. - -## What you get - -``` -Claude Code Session (root trace) -├── Turn 1: "Add error handling" -│ ├── Read: src/app.ts -│ ├── Edit: src/app.ts -│ └── Response: "I've added try-catch..." -├── Turn 2: "Now run the tests" -│ ├── Terminal: npm test -│ └── Response: "All tests pass..." -└── Turn 3: "Great, commit this" - ├── Terminal: git add . - ├── Terminal: git commit -m "..." - └── Response: "Changes committed..." 
-``` - -## How it works - -Four hooks capture the complete workflow: - -| Hook | What it captures | -|------|------------------| -| **SessionStart** | Creates root trace when you start Claude Code | -| **PostToolUse** | Captures every tool call (file reads, edits, terminal commands) | -| **Stop** | Captures conversation turns (your message + Claude's response) | -| **SessionEnd** | Logs session summary when you exit | - -## Quick setup - -Run the setup script in any project directory where you want tracing: - -```bash -bash /path/to/skills/trace-claude-code/setup.sh -``` - -The script prompts for your API key and project name, then configures all hooks automatically. - -## Manual setup - -### Prerequisites - -- [Claude Code CLI](https://docs.anthropic.com/en/docs/claude-code) installed -- [Braintrust API key](https://www.braintrust.dev/app/settings/api-keys) -- `jq` command-line tool (`brew install jq` on macOS) - -### Configuration - -Create `.claude/settings.local.json` in your project directory: - -```json -{ - "hooks": { - "SessionStart": [ - { - "hooks": [ - { - "type": "command", - "command": "bash /path/to/hooks/session_start.sh" - } - ] - } - ], - "PostToolUse": [ - { - "matcher": "*", - "hooks": [ - { - "type": "command", - "command": "bash /path/to/hooks/post_tool_use.sh" - } - ] - } - ], - "Stop": [ - { - "hooks": [ - { - "type": "command", - "command": "bash /path/to/hooks/stop_hook.sh" - } - ] - } - ], - "SessionEnd": [ - { - "hooks": [ - { - "type": "command", - "command": "bash /path/to/hooks/session_end.sh" - } - ] - } - ] - }, - "env": { - "TRACE_TO_BRAINTRUST": "true", - "BRAINTRUST_API_KEY": "sk-...", - "BRAINTRUST_CC_PROJECT": "my-project" - } -} -``` - -Replace `/path/to/hooks/` with the actual path to this skill's hooks directory. - -### Environment variables - -| Variable | Required | Description | -|----------|----------|-------------| -| `TRACE_TO_BRAINTRUST` | Yes | Set to `"true"` to enable tracing | -| `BRAINTRUST_API_KEY` | Yes | Your Braintrust API key | -| `BRAINTRUST_CC_PROJECT` | No | Project name (default: `claude-code`) | -| `BRAINTRUST_CC_DEBUG` | No | Set to `"true"` for verbose logging | - -## Viewing traces - -After running Claude Code with tracing enabled: - -1. Go to [braintrust.dev](https://www.braintrust.dev) -2. Navigate to your project (e.g., `claude-code`) -3. Click **Logs** to see all traced sessions - -Each trace shows: -- **Session root**: The overall Claude Code session -- **Turns**: Each conversation exchange (user input → assistant response) -- **Tool calls**: Individual operations (file reads, edits, terminal commands) - -## Trace structure - -Traces are hierarchical: - -- **Session** (root span) - - `span_attributes.type`: `"task"` - - `metadata.session_id`: Unique session identifier - - `metadata.workspace`: Project directory - -- **Turn** (child of session) - - `span_attributes.type`: `"llm"` - - `input`: User message - - `output`: Assistant response - - `metadata.turn_number`: Sequential turn number - -- **Tool call** (child of turn or session) - - `span_attributes.type`: `"tool"` - - `input`: Tool input (file path, command, etc.) - - `output`: Tool result - - `metadata.tool_name`: Name of the tool used - -## Troubleshooting - -### No traces appearing - -1. **Check hooks are running:** - ```bash - tail -f ~/.claude/state/braintrust_hook.log - ``` - -2. **Verify environment variables** in `.claude/settings.local.json`: - - `TRACE_TO_BRAINTRUST` must be `"true"` - - `BRAINTRUST_API_KEY` must be valid - -3. 
**Enable debug mode:** - ```json - { - "env": { - "BRAINTRUST_CC_DEBUG": "true" - } - } - ``` - -### Permission errors - -Make hook scripts executable: - -```bash -chmod +x /path/to/hooks/*.sh -``` - -### Missing jq command - -Install jq: -- **macOS**: `brew install jq` -- **Ubuntu/Debian**: `sudo apt-get install jq` - -### State issues - -Reset the tracing state: - -```bash -rm ~/.claude/state/braintrust_state.json -``` - -### Hook logs - -View detailed hook execution logs: - -```bash -# Follow logs in real-time -tail -f ~/.claude/state/braintrust_hook.log - -# View last 50 lines -tail -50 ~/.claude/state/braintrust_hook.log - -# Clear logs -> ~/.claude/state/braintrust_hook.log -``` - -## File structure - -``` -hooks/ -├── common.sh # Shared utilities (logging, API, state) -├── session_start.sh # Creates root trace span -├── post_tool_use.sh # Captures tool calls -├── stop_hook.sh # Captures conversation turns -└── session_end.sh # Finalizes trace -``` - -## Alternative: SDK integration - -For programmatic use with the Claude Agent SDK, use the native Braintrust integration: - -```typescript -import { initLogger, wrapClaudeAgentSDK } from "braintrust"; -import * as claudeSDK from "@anthropic-ai/claude-agent-sdk"; - -initLogger({ - projectName: "my-project", - apiKey: process.env.BRAINTRUST_API_KEY, -}); - -const { query, tool } = wrapClaudeAgentSDK(claudeSDK); -``` - -See [Braintrust Claude Agent SDK docs](https://www.braintrust.dev/docs/integrations/sdk-integrations/claude-agent-sdk) for details. diff --git a/skills/using-braintrust/SKILL.md b/skills/using-braintrust/SKILL.md deleted file mode 100644 index 0cc455b..0000000 --- a/skills/using-braintrust/SKILL.md +++ /dev/null @@ -1,170 +0,0 @@ ---- -name: using-braintrust -description: | - Enables AI agents to use Braintrust for LLM evaluation, logging, and observability. - Includes scripts for querying logs with SQL, running evals, and logging data. -version: 1.0.0 ---- - -# Using Braintrust - -Braintrust is a platform for evaluating, logging, and monitoring LLM applications. - -## Listing projects - -Use `scripts/list_projects.py` to see all available projects: - -```bash -uv run /path/to/scripts/list_projects.py -``` - -## Querying logs with SQL - -Use the `query_logs.py` script to run SQL queries against Braintrust logs. - -**Always share the SQL query you used** when reporting results, so the user understands what was executed. 
- -**Script location:** `scripts/query_logs.py` (relative to this file) - -**Run from the user's project directory** (where `.env` with `BRAINTRUST_API_KEY` exists): - -```bash -uv run /path/to/scripts/query_logs.py --project "Project Name" --query "SQL_QUERY" -``` - -### Common queries - -**Count logs from last 24 hours:** -```sql -SELECT count(*) as count FROM logs WHERE created > now() - interval 1 day -``` - -**Get recent logs:** -```sql -SELECT input, output, created FROM logs ORDER BY created DESC LIMIT 10 -``` - -**Filter by metadata:** -```sql -SELECT input, output FROM logs WHERE metadata.user_id = 'user123' LIMIT 20 -``` - -**Filter by time range:** -```sql -SELECT * FROM logs WHERE created > now() - interval 7 day LIMIT 50 -``` - -**Aggregate by field:** -```sql -SELECT metadata.model, count(*) as count FROM logs GROUP BY metadata.model -``` - -**Group by hour:** -```sql -SELECT hour(created) as hr, count(*) as count FROM logs GROUP BY hour(created) -``` - -### SQL quirks in Braintrust - -- **Time functions**: Use `hour()`, `day()`, `month()`, `year()` instead of `date_trunc()` - - ✅ `hour(created)` - - ❌ `date_trunc('hour', created)` -- **Intervals**: Use `interval 1 day`, `interval 7 day`, `interval 1 hour` (no quotes, singular unit) -- **Nested fields**: Use dot notation: `metadata.user_id`, `scores.Factuality`, `metrics.duration` -- **Table name**: Always use `FROM logs` (the script handles project scoping) - -### SQL reference - -**Operators:** -- `=`, `!=`, `>`, `<`, `>=`, `<=` -- `IS NULL`, `IS NOT NULL` -- `LIKE 'pattern%'` -- `AND`, `OR`, `NOT` - -**Aggregations:** -- `count(*)`, `count(field)` -- `avg(field)`, `sum(field)` -- `min(field)`, `max(field)` - -**Time filters:** -- `created > now() - interval 1 day` -- `created > now() - interval 7 day` -- `created > now() - interval 1 hour` - -## Logging data - -Use `scripts/log_data.py` to log data to a project: - -```bash -uv run /path/to/scripts/log_data.py --project "Project Name" --input "query" --output "response" -``` - -With metadata: -```bash ---input "query" --output "response" --metadata '{"user_id": "123"}' -``` - -Batch from JSON: -```bash ---data '[{"input": "a", "output": "b"}, {"input": "c", "output": "d"}]' -``` - -## Running evaluations - -Use `scripts/run_eval.py` to run evaluations: - -```bash -uv run /path/to/scripts/run_eval.py --project "Project Name" --data '[{"input": "test", "expected": "test"}]' -``` - -From file: -```bash ---data-file test_cases.json --scorer factuality -``` - -## Setup - -Create a `.env` file in your project directory: - -``` -BRAINTRUST_API_KEY=your-api-key-here -``` - -## Writing evaluation code (SDK) - -For custom evaluation logic, use the SDK directly. - -**IMPORTANT**: First argument to `Eval()` is the project name (positional). - -```python -import braintrust -from autoevals import Factuality - -braintrust.Eval( - "My Project", # Project name (required, positional) - data=lambda: [{"input": "What is 2+2?", "expected": "4"}], - task=lambda input: my_llm_call(input), - scores=[Factuality], -) -``` - -**Common mistakes:** -- ❌ `Eval(project_name="My Project", ...)` - Wrong! -- ❌ `Eval(name="My Project", ...)` - Wrong! -- ✅ `Eval("My Project", data=..., task=..., scores=...)` - Correct! - -## Writing logging code (SDK) - -```python -import braintrust - -logger = braintrust.init_logger(project="My Project") -logger.log(input="query", output="response", metadata={"user_id": "123"}) -logger.flush() # Always flush! 
-``` - -## Common issues - -- **"Eval() got an unexpected keyword argument 'project_name'"**: Use positional argument -- **Logs not appearing**: Call `logger.flush()` after logging -- **Authentication errors**: Create `.env` file with `BRAINTRUST_API_KEY=your-key` diff --git a/skills/using-braintrust/scripts/_common.py b/skills/using-braintrust/scripts/_common.py deleted file mode 100644 index e1b4226..0000000 --- a/skills/using-braintrust/scripts/_common.py +++ /dev/null @@ -1,53 +0,0 @@ -""" -Common utilities for Braintrust scripts. - -This module provides shared functionality for loading environment variables, -checking API keys, and initializing the Braintrust SDK. -""" - -import os -import sys -from pathlib import Path - -from dotenv import load_dotenv - - -def load_env(): - """Load environment from .env file in current directory or parents.""" - for path in [Path.cwd(), *Path.cwd().parents]: - env_file = path / ".env" - if env_file.exists(): - load_dotenv(env_file) - return True - return False - - -def require_api_key(): - """Ensure BRAINTRUST_API_KEY is set, exit with error if not.""" - if not os.environ.get("BRAINTRUST_API_KEY"): - print("Error: BRAINTRUST_API_KEY not found.", file=sys.stderr) - print("Set it via environment variable or create a .env file with:", file=sys.stderr) - print(' BRAINTRUST_API_KEY="your-api-key"', file=sys.stderr) - sys.exit(1) - - -def init_braintrust(): - """ - Initialize Braintrust SDK: load env, check API key, and login. - - This handles API URL discovery automatically via the login endpoint. - Supports BRAINTRUST_APP_URL env var for alternate deployments. - """ - import braintrust - - load_env() - require_api_key() - braintrust.login() - - -def get_api_conn(): - """Get the Braintrust API connection.""" - import braintrust - - braintrust.login() # No-op if already logged in - return braintrust.api_conn() diff --git a/skills/using-braintrust/scripts/list_projects.py b/skills/using-braintrust/scripts/list_projects.py deleted file mode 100644 index f53fff0..0000000 --- a/skills/using-braintrust/scripts/list_projects.py +++ /dev/null @@ -1,56 +0,0 @@ -#!/usr/bin/env python3 -# /// script -# requires-python = ">=3.9" -# dependencies = ["braintrust", "python-dotenv"] -# /// -""" -List Braintrust projects. 
- -Usage: - uv run list_projects.py - uv run list_projects.py --limit 20 - -Environment variables: - BRAINTRUST_API_KEY: Your Braintrust API key (required) - BRAINTRUST_APP_URL: Braintrust app URL (default: https://www.braintrust.dev) -""" - -import argparse - -from _common import get_api_conn, init_braintrust - - -def main(): - parser = argparse.ArgumentParser(description="List Braintrust projects") - parser.add_argument("--limit", type=int, default=50, help="Maximum number of projects to list") - args = parser.parse_args() - - init_braintrust() - conn = get_api_conn() - - resp = conn.get("v1/project", params={"limit": args.limit}) - - if resp.status_code != 200: - print(f"Error: {resp.status_code} - {resp.text}") - return - - projects = resp.json().get("objects", []) - - if not projects: - print("No projects found.") - return - - print(f"Found {len(projects)} projects:\n") - for p in projects: - name = p.get("name", "unnamed") - project_id = p.get("id", "") - created = p.get("created", "")[:10] if p.get("created") else "" - print(f" - {name}") - print(f" ID: {project_id}") - if created: - print(f" Created: {created}") - print() - - -if __name__ == "__main__": - main() diff --git a/skills/using-braintrust/scripts/log_data.py b/skills/using-braintrust/scripts/log_data.py deleted file mode 100644 index ff40d1d..0000000 --- a/skills/using-braintrust/scripts/log_data.py +++ /dev/null @@ -1,87 +0,0 @@ -#!/usr/bin/env python3 -# /// script -# requires-python = ">=3.9" -# dependencies = ["braintrust", "python-dotenv"] -# /// -""" -Log data to a Braintrust project. - -Usage: - uv run log_data.py --project "My Project" --input "hello" --output "world" - uv run log_data.py --project "My Project" --data '[{"input": "a", "output": "b"}]' - -Environment variables: - BRAINTRUST_API_KEY: Your Braintrust API key (required) - BRAINTRUST_APP_URL: Braintrust app URL (default: https://www.braintrust.dev) -""" - -import argparse -import json -import sys - -import braintrust -from _common import load_env, require_api_key - - -def main(): - parser = argparse.ArgumentParser(description="Log data to Braintrust") - parser.add_argument("--project", required=True, help="Project name") - parser.add_argument("--input", help="Input value") - parser.add_argument("--output", help="Output value") - parser.add_argument("--expected", help="Expected value (optional)") - parser.add_argument("--metadata", help="JSON metadata (optional)") - parser.add_argument("--scores", help="JSON scores (optional)") - parser.add_argument("--data", help="JSON array of log entries") - parser.add_argument("--data-file", help="Path to JSON file with log entries") - args = parser.parse_args() - - load_env() - require_api_key() - - logger = braintrust.init_logger(project=args.project) - - # Batch logging - if args.data or args.data_file: - if args.data: - entries = json.loads(args.data) - else: - with open(args.data_file) as f: - entries = json.load(f) - - if not isinstance(entries, list): - entries = [entries] - - for entry in entries: - logger.log(**entry) - - logger.flush() - print(f"Logged {len(entries)} entries to project: {args.project}") - return - - # Single entry logging - if not args.input: - print("Error: Provide --input or --data/--data-file", file=sys.stderr) - sys.exit(1) - - log_kwargs = {"input": args.input} - - if args.output: - log_kwargs["output"] = args.output - if args.expected: - log_kwargs["expected"] = args.expected - if args.metadata: - log_kwargs["metadata"] = json.loads(args.metadata) - if args.scores: - 
log_kwargs["scores"] = json.loads(args.scores) - - logger.log(**log_kwargs) - logger.flush() - - print(f"Logged entry to project: {args.project}") - print(f" Input: {args.input}") - if args.output: - print(f" Output: {args.output}") - - -if __name__ == "__main__": - main() diff --git a/skills/using-braintrust/scripts/query_logs.py b/skills/using-braintrust/scripts/query_logs.py deleted file mode 100644 index 86ad46c..0000000 --- a/skills/using-braintrust/scripts/query_logs.py +++ /dev/null @@ -1,114 +0,0 @@ -#!/usr/bin/env python3 -# /// script -# requires-python = ">=3.9" -# dependencies = ["braintrust", "python-dotenv"] -# /// -""" -Execute a SQL query against Braintrust project logs. - -Usage: - uv run query_logs.py --project "My Project" --query "SELECT input, output FROM logs LIMIT 10" - uv run query_logs.py --project "My Project" --query "SELECT count(*) as count FROM logs WHERE created > now() - interval '1 day'" - -Environment variables: - BRAINTRUST_API_KEY: Your Braintrust API key (required) - BRAINTRUST_APP_URL: Braintrust app URL (default: https://www.braintrust.dev) -""" - -import argparse -import json -import re -import sys - -from _common import get_api_conn, init_braintrust - - -def get_project_id(project_name: str) -> str: - """Get project ID from name using the SDK's API connection.""" - conn = get_api_conn() - - # Try to get by name - resp = conn.get("v1/project", params={"project_name": project_name}) - if resp.status_code == 200: - projects = resp.json().get("objects", []) - if projects: - return projects[0]["id"] - - # Try listing all projects and matching by name - resp = conn.get("v1/project") - if resp.status_code == 200: - projects = resp.json().get("objects", []) - for p in projects: - if p.get("name", "").lower() == project_name.lower(): - return p["id"] - - print(f"Error: Project '{project_name}' not found", file=sys.stderr) - print("Available projects:", file=sys.stderr) - if resp.status_code == 200: - for p in resp.json().get("objects", [])[:10]: - print(f" - {p.get('name')}", file=sys.stderr) - sys.exit(1) - - -def run_sql(project_id: str, query: str) -> list[dict]: - """Execute SQL query against Braintrust logs using the SDK's API connection.""" - conn = get_api_conn() - - # Replace "FROM logs" with the project-scoped source - full_query = re.sub( - r"\bFROM\s+logs\b", f"FROM project_logs('{project_id}')", query, flags=re.IGNORECASE - ) - - resp = conn.post("btql", json={"query": full_query, "fmt": "json"}) - - if resp.status_code == 200: - return resp.json().get("data", []) - else: - print(f"Error: {resp.status_code} - {resp.text}", file=sys.stderr) - sys.exit(1) - - -def main(): - parser = argparse.ArgumentParser(description="Execute SQL query against Braintrust logs") - parser.add_argument("--project", required=True, help="Project name") - parser.add_argument( - "--query", required=True, help="SQL query (use 'FROM logs' for the project)" - ) - parser.add_argument( - "--format", choices=["json", "table"], default="table", help="Output format" - ) - args = parser.parse_args() - - init_braintrust() - - project_id = get_project_id(args.project) - - # Show the SQL query being executed - executed_query = re.sub( - r"\bFROM\s+logs\b", f"FROM project_logs('{project_id}')", args.query, flags=re.IGNORECASE - ) - print(f"Executing SQL: {executed_query}\n", file=sys.stderr) - - results = run_sql(project_id, args.query) - - if args.format == "json": - print(json.dumps(results, indent=2, default=str)) - else: - if not results: - print("No results") - elif len(results) 
== 1 and len(results[0]) == 1: - # Single value result (like count) - key, value = list(results[0].items())[0] - print(f"{key}: {value}") - else: - print(f"Found {len(results)} results:\n") - for i, row in enumerate(results): - print(f"--- Result {i+1} ---") - for key, value in row.items(): - val_str = str(value)[:200] if value else "null" - print(f" {key}: {val_str}") - print() - - -if __name__ == "__main__": - main() diff --git a/skills/using-braintrust/scripts/run_eval.py b/skills/using-braintrust/scripts/run_eval.py deleted file mode 100644 index 6327f44..0000000 --- a/skills/using-braintrust/scripts/run_eval.py +++ /dev/null @@ -1,93 +0,0 @@ -#!/usr/bin/env python3 -# /// script -# requires-python = ">=3.9" -# dependencies = ["braintrust", "autoevals", "python-dotenv"] -# /// -""" -Run a Braintrust evaluation with custom data. - -Usage: - uv run run_eval.py --project "My Project" --data '[{"input": "test", "expected": "test"}]' - uv run run_eval.py --project "My Project" --data-file data.json - -Environment variables: - BRAINTRUST_API_KEY: Your Braintrust API key (required) - BRAINTRUST_APP_URL: Braintrust app URL (default: https://www.braintrust.dev) -""" - -import argparse -import json -import sys - -import braintrust -from _common import load_env, require_api_key -from autoevals import Factuality, Score - - -def simple_task(input_data): - """Default task that just echoes input. Replace with your LLM call.""" - if isinstance(input_data, dict): - return str(input_data.get("input", input_data)) - return str(input_data) - - -def exact_match_scorer(input, output, expected=None, **kwargs): - """Scorer that checks for exact match with expected.""" - if expected is None: - return Score(name="Exact Match", score=1.0, metadata={"reason": "no expected"}) - - match = str(output).strip().lower() == str(expected).strip().lower() - return Score( - name="Exact Match", - score=1.0 if match else 0.0, - metadata={"output": str(output)[:100], "expected": str(expected)[:100]}, - ) - - -def main(): - parser = argparse.ArgumentParser(description="Run a Braintrust evaluation") - parser.add_argument("--project", required=True, help="Project name") - parser.add_argument("--data", help="JSON string of data") - parser.add_argument("--data-file", help="Path to JSON file with data") - parser.add_argument("--experiment", help="Experiment name (optional)") - parser.add_argument( - "--scorer", default="exact", choices=["exact", "factuality"], help="Scorer to use" - ) - args = parser.parse_args() - - load_env() - require_api_key() - - # Load data - if args.data: - data = json.loads(args.data) - elif args.data_file: - with open(args.data_file) as f: - data = json.load(f) - else: - print("Error: Provide --data or --data-file", file=sys.stderr) - sys.exit(1) - - # Ensure data is a list - if not isinstance(data, list): - data = [data] - - # Select scorer - scorers = [Factuality] if args.scorer == "factuality" else [exact_match_scorer] - - # Run eval - print(f"Running evaluation on project: {args.project}") - print(f"Data: {len(data)} items") - print(f"Scorer: {args.scorer}") - - braintrust.Eval( - args.project, - data=lambda: data, - task=simple_task, - scores=scorers, - experiment_name=args.experiment, - ) - - -if __name__ == "__main__": - main()