diff --git a/.github/workflows/docs-agent-eval-ci.yml b/.github/workflows/docs-agent-eval-ci.yml index 8e6d4647e..9d15c55a2 100644 --- a/.github/workflows/docs-agent-eval-ci.yml +++ b/.github/workflows/docs-agent-eval-ci.yml @@ -1,9 +1,12 @@ # Runs scenarios 01+02 (curl + TypeScript SDK) with heuristic + LLM judge. # Sets EVAL_LOCAL_DOCS=1 so the agent reads repo docs under docs/ (not production WebFetch). +# Injects OUTPOST_API_KEY (and related env) for the agent run so smoke tests hit live Outpost; +# step 2 re-runs the saved artifacts deterministically. # Triggers: workflow_dispatch, or push (main) / pull_request when docs / OpenAPI / agent-eval / TS SDK paths change. # Each run bills Anthropic (agent + judge). # Requires repo secrets: ANTHROPIC_API_KEY, EVAL_TEST_DESTINATION_URL, OUTPOST_API_KEY # (OUTPOST_TEST_WEBHOOK_URL uses the same URL as EVAL_TEST_DESTINATION_URL in CI.) +# Env is scoped per step — see each run step's env block (no job-wide secret export). # See docs/agent-evaluation/README.md § CI (recommended slice). name: Docs agent eval (CI slice) @@ -64,6 +67,9 @@ jobs: ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} EVAL_TEST_DESTINATION_URL: ${{ secrets.EVAL_TEST_DESTINATION_URL }} EVAL_LOCAL_DOCS: "1" + OUTPOST_API_KEY: ${{ secrets.OUTPOST_API_KEY }} + OUTPOST_TEST_WEBHOOK_URL: ${{ secrets.EVAL_TEST_DESTINATION_URL }} + OUTPOST_API_BASE_URL: https://api.outpost.hookdeck.com/2025-07-01 run: ./scripts/ci-eval.sh - name: Execute generated curl + TypeScript artifacts (live Outpost) diff --git a/docs/agent-evaluation/.env.example b/docs/agent-evaluation/.env.example index 9a3f461cc..fe90eeab6 100644 --- a/docs/agent-evaluation/.env.example +++ b/docs/agent-evaluation/.env.example @@ -6,9 +6,8 @@ ANTHROPIC_API_KEY= # Required for Turn 0 template (test webhook URL injected into the prompt) EVAL_TEST_DESTINATION_URL= -# Strongly recommended for a *full* eval: run the agent’s curl/script/app against a real project. 
-# The harness does not read this key; you (or a future verifier) use it after the run. -# OUTPOST_API_KEY= # required for ./scripts/execute-ci-artifacts.sh after eval:ci; GitHub Actions CI execution step +# Strongly recommended for CI and full local eval: forwarded to the agent sandbox when set. +# OUTPOST_API_KEY= # OUTPOST_API_BASE_URL=https://api.outpost.hookdeck.com/2025-07-01 # OUTPOST_TEST_WEBHOOK_URL=https://hkdk.events/your-source-id # often same as EVAL_TEST_DESTINATION_URL # OUTPOST_CI_PUBLISH_TOPIC=user.created # optional; publish topic for npm run smoke:execute-ci (must exist in project) @@ -36,4 +35,5 @@ EVAL_TEST_DESTINATION_URL= # Scoring is ON by default after each scenario (heuristic + LLM). Opt out: # EVAL_NO_SCORE_HEURISTIC=1 # EVAL_NO_SCORE_LLM=1 +# LLM judge model (default claude-sonnet-4-6 — current Sonnet tier; no newer Sonnet as of 2026-06) # EVAL_SCORE_MODEL=claude-sonnet-4-6 diff --git a/docs/agent-evaluation/README.md b/docs/agent-evaluation/README.md index a732a6724..f0a03fc0e 100644 --- a/docs/agent-evaluation/README.md +++ b/docs/agent-evaluation/README.md @@ -140,6 +140,7 @@ Each scenario run uses one directory: - **`-scenario-NN.eval-aborted.json`** — **SIGTERM** / **SIGINT** before completion (not **SIGKILL**) If **`transcript.json`** is missing, check these files next to **`…/runs/-scenario-NN/`** (same directory as the run folder, not inside it). - **`heuristic-score.json`** / **`llm-score.json`** — by default (unless disabled above) +- **`llm-judge-failure.json`** — when the LLM judge returns unparseable JSON after retries: full raw model text per attempt plus `stop_reason` and token usage (for CI triage) - **Agent-written files** — the SDK **`cwd`** is this directory. Defaults include **`Write`**, **`Edit`**, and **`Bash`** for clones, installs, and generated code. 
Re-score a finished run without re-invoking the agent — uses **today's** [`src/score-transcript.ts`](src/score-transcript.ts) and **scenario markdown on disk** (so LLM criteria update when you edit **`## Success criteria`**): @@ -210,19 +211,19 @@ For **pull-request or main-branch** automation, run **two** scenarios only: ```sh cd docs/agent-evaluation && npm ci && npm run eval:ci -# or: ./scripts/ci-eval.sh # requires ANTHROPIC_API_KEY + EVAL_TEST_DESTINATION_URL in the environment -# after a successful eval:ci, live Outpost smoke: OUTPOST_API_KEY + OUTPOST_TEST_WEBHOOK_URL ./scripts/execute-ci-artifacts.sh +# or: ./scripts/ci-eval.sh # requires ANTHROPIC_API_KEY, EVAL_TEST_DESTINATION_URL, OUTPOST_API_KEY +# after a successful eval:ci, CI step 2 re-runs artifacts: ./scripts/execute-ci-artifacts.sh ``` `eval:ci` is **`npm run eval -- --scenarios 01,02`**: both **heuristic** checks and the **LLM judge** (grounded in each scenario's **`## Success criteria`**). Skipping the judge would leave you with regex-only signal, which does not encode the product checklist. -**GitHub Actions:** add repository secrets **`ANTHROPIC_API_KEY`**, **`EVAL_TEST_DESTINATION_URL`**, and **`OUTPOST_API_KEY`**. Workflow **`.github/workflows/docs-agent-eval-ci.yml`** runs **`./scripts/ci-eval.sh`** with **`EVAL_LOCAL_DOCS=1`** (agent **reads docs from the repo**), then **`./scripts/execute-ci-artifacts.sh`**: picks the **newest** **`*-scenario-01`** / **`*-scenario-02`** pair from **`results/runs/`**, runs the generated **`.sh`** then **`npx tsx`** on the TypeScript artifact (**`npm install`** in the **02** run dir when **`package.json`** exists). **`OUTPOST_TEST_WEBHOOK_URL`** in CI is set from the same secret as **`EVAL_TEST_DESTINATION_URL`**. 
Triggers on **`workflow_dispatch`** (manual: Actions → **Docs agent eval (CI slice)** → **Run workflow**, pick branch), pushes to **`main`**, and **pull requests** when **`docs/content/**`**, **`docs/apis/**`**, **`sdks/outpost-typescript/**`**, root **`docs/README.md`** / **`docs/AGENTS.md`**, or **`docs/agent-evaluation/**`** change (GitHub does not allow **`paths`** + **`paths-ignore`** together on the same event, so edits under e.g. **`docs/agent-evaluation/README.md`** also match **`docs/agent-evaluation/**`** and can trigger a run). Uses **`ubuntu-latest`** (Claude Agent SDK needs normal filesystem access — avoid tight sandboxes; see **Permissions / failures** above). **Fork PRs** skip this job (secrets are not available). +**GitHub Actions:** add repository secrets **`ANTHROPIC_API_KEY`**, **`EVAL_TEST_DESTINATION_URL`**, and **`OUTPOST_API_KEY`**. Workflow **`.github/workflows/docs-agent-eval-ci.yml`** scopes env **per step** (no job-wide secret blast radius): step 1 **`./scripts/ci-eval.sh`** gets Anthropic + eval + Outpost creds so the agent can **read repo docs** (`EVAL_LOCAL_DOCS=1`) and **run live smoke tests**; step 2 **`./scripts/execute-ci-artifacts.sh`** gets only Outpost execution vars (including tenant cleanup). Step 2 re-runs the newest **`*-scenario-01`** / **`*-scenario-02`** pair deterministically (generated **`.sh`** then **`npx tsx`** on the TypeScript artifact; **`npm install`** in the **02** run dir when **`package.json`** exists). **`OUTPOST_TEST_WEBHOOK_URL`** in CI is set from the same secret as **`EVAL_TEST_DESTINATION_URL`**. 
Triggers on **`workflow_dispatch`** (manual: Actions → **Docs agent eval (CI slice)** → **Run workflow**, pick branch), pushes to **`main`**, and **pull requests** when **`docs/content/**`**, **`docs/apis/**`**, **`sdks/outpost-typescript/**`**, root **`docs/README.md`** / **`docs/AGENTS.md`**, or **`docs/agent-evaluation/**`** change (GitHub does not allow **`paths`** + **`paths-ignore`** together on the same event, so edits under e.g. **`docs/agent-evaluation/README.md`** also match **`docs/agent-evaluation/**`** and can trigger a run). Uses **`ubuntu-latest`** (Claude Agent SDK needs normal filesystem access — avoid tight sandboxes; see **Permissions / failures** above). **Fork PRs** skip this job (secrets are not available). The workflow uses **`concurrency: { group: outpost-docs-agent-eval-live-outpost, cancel-in-progress: false }`** so only one run at a time talks to the shared CI Outpost project for execution, and sets **`OUTPOST_CI_CLEANUP_TENANT=customer_acme_001`** so **`execute-ci-artifacts.sh`** **DELETE**s that tenant before the curl script and again on **EXIT** (clears destinations from prior runs and avoids parallel deletes). Override the tenant id only if your Turn 0 fixtures consistently use another id. - **`ANTHROPIC_API_KEY`** — required for the agent and for the **LLM judge** (Success criteria) after each scenario you run. - **`EVAL_TEST_DESTINATION_URL`** — required for Turn 0; same Source URL as `{{TEST_DESTINATION_URL}}` (and, in CI, reused as **`OUTPOST_TEST_WEBHOOK_URL`** for execution). -- **`OUTPOST_API_KEY`** — required for **`execute-ci-artifacts.sh`** and for **GitHub Actions** execution after **`eval:ci`**. For **local** transcript-only runs you can omit it. Put the key in **`docs/agent-evaluation/.env`** (or export); never paste it into chat. +- **`OUTPOST_API_KEY`** — required for **`ci-eval.sh`**, **`execute-ci-artifacts.sh`**, and **GitHub Actions**. 
The eval runner forwards it (with **`OUTPOST_TEST_WEBHOOK_URL`** / **`OUTPOST_API_BASE_URL`**) into the agent sandbox when set so the model can run live smoke tests; the LLM judge scores execution strictly in that mode. For **local transcript-only** runs you can omit it (the judge applies a missing-env exception). Put the key in **`docs/agent-evaluation/.env`** (or export); never paste it into chat. - **`EVAL_LOCAL_DOCS=1`** — Turn 0 replaces public doc URLs with **absolute paths to repo docs** (primarily **`.mdoc`** under **`docs/content/`**, plus OpenAPI under **`docs/apis/`**; the Turn 0 template itself is **[`hookdeck-outpost-agent-prompt.md`](hookdeck-outpost-agent-prompt.md)**). The agent uses **Read** on **`docs/`** instead of **WebFetch** to production. Use locally when validating unpublished docs; **GitHub Actions** sets this for **`docs-agent-eval-ci.yml`**. - **`EVAL_SKIP_HARNESS_PRE_STEPS=1`** — skip **`git_clone`** (and any future **`preSteps`**) declared in a scenario's **`## Eval harness`** JSON block; useful offline or when the baseline folder is already present. @@ -250,7 +251,9 @@ Changing **`EVAL_PERMISSION_MODE`** is usually unnecessary; widening **`EVAL_TOO ### Transcript vs execution (full pass) -`npm run eval` only captures **what the model produced**; by itself it does **not** call Outpost (transcript review). **`./scripts/execute-ci-artifacts.sh`** (and the **GitHub Actions** workflow's second step) runs the **01** shell + **02** TypeScript outputs against **live** Outpost when **`OUTPOST_API_KEY`** and **`OUTPOST_TEST_WEBHOOK_URL`** are set. +When **`OUTPOST_API_KEY`** is set, `npm run eval` forwards it to the agent sandbox; transcripts can show **live** curl/SDK smoke tests and the **LLM judge** scores execution-style Success criteria from that evidence (no missing-env exception). 
**`./scripts/execute-ci-artifacts.sh`** (and the **GitHub Actions** workflow's second step) still **re-runs** the saved **01** shell + **02** TypeScript outputs deterministically — a separate gate from the agent session. + +Without **`OUTPOST_API_KEY`**, eval is **transcript-only** for execution: heuristics + LLM still run, but the judge may pass execution rows when failure was solely due to missing env (see [`src/llm-judge.ts`](src/llm-judge.ts)). **Local smoke (no agent):** to verify secrets and the managed API the same way CI does—without depending on a fresh eval transcript—run from **`docs/agent-evaluation/`** with **`OUTPOST_API_KEY`** and **`OUTPOST_TEST_WEBHOOK_URL`** set (e.g. **`source .env`**): diff --git a/docs/agent-evaluation/fixtures/placeholder-values-for-turn0.md b/docs/agent-evaluation/fixtures/placeholder-values-for-turn0.md index b8619c75d..d434eb567 100644 --- a/docs/agent-evaluation/fixtures/placeholder-values-for-turn0.md +++ b/docs/agent-evaluation/fixtures/placeholder-values-for-turn0.md @@ -6,7 +6,7 @@ The **prompt template itself** lives in one place only: Do **not** paste real API keys into chat. Have operators put `OUTPOST_API_KEY` in a project `**.env`\*\* (or another loader), not in the agent transcript. Use a throwaway Hookdeck project when possible. -For `**npm run eval -- --scenario …**` (or `**--scenarios**` / `**--all**`), the runner only needs `**ANTHROPIC_API_KEY**` and `**EVAL_TEST_DESTINATION_URL**`. To score a **full** eval (generated commands/code actually work), you still need `**OUTPOST_API_KEY`** (and usually `**OUTPOST_TEST_WEBHOOK_URL**`) when you **execute** the agent’s output afterward. Optional `**EVAL_LOCAL_DOCS=1`** points Turn 0 at repo paths instead of live `{{DOCS_URL}}` links. +For `**npm run eval -- --scenario …**` (or `**--scenarios**` / `**--all**`), the runner requires `**ANTHROPIC_API_KEY**` and `**EVAL_TEST_DESTINATION_URL**`. 
For **CI** and **full** evals (agent runs live smoke tests + strict LLM execution scoring), also set `**OUTPOST_API_KEY**` (and usually `**OUTPOST_TEST_WEBHOOK_URL**` — defaults to `EVAL_TEST_DESTINATION_URL` in `ci-eval.sh`). Optional `**EVAL_LOCAL_DOCS=1**` points Turn 0 at repo paths instead of live `{{DOCS_URL}}` links. --- diff --git a/docs/agent-evaluation/hookdeck-outpost-agent-prompt.md b/docs/agent-evaluation/hookdeck-outpost-agent-prompt.md index 732607d3a..51a06612f 100644 --- a/docs/agent-evaluation/hookdeck-outpost-agent-prompt.md +++ b/docs/agent-evaluation/hookdeck-outpost-agent-prompt.md @@ -104,6 +104,7 @@ Goal: tenant → **one destination** (often webhook to `{{TEST_DESTINATION_URL}} - Default to **curl** when they want the absolute minimum and did not name a language. - When they name **TypeScript**, **Python**, or **Go**, produce **only** what that language’s **quickstart** describes—typically **one file** (plus `package.json` / `go.mod` / venv if the quickstart needs it), not a full application tree. - Ask only for env vars and details the quickstart still needs. +- **Verify delivery the quickstart way:** after publish, print the **event id** (or 202 success) and point the operator to **Hookdeck Console** / dashboard **logs** for the test webhook URL. **Do not** add an immediate `events.list` / `GET …/events` check that throws if the event is missing on the first try — publish is **202 (accepted)** and observability APIs are **eventually consistent** (see **{{DOCS_URL}}/concepts#publish-acceptance-vs-observability**). Reserve events/attempts listing and retry UX for **Building your own UI** / full-stack paths, not the one-file quickstart. ### New minimal application @@ -145,6 +146,7 @@ Apply **only** the items below that fit the task; **skip** any that do not apply - [ ] **Ran** the smallest end-to-end check that fits this task (e.g. 
run the script or shell flow once, exercise one new API path, or smoke the UI/API flow you added) and saw a clear success signal (e.g. event id, HTTP 2xx, or expected output). - [ ] **Secrets:** The platform Outpost API key remains **server-side** / **environment** only — not in client bundles, not hard-coded in committed source. - [ ] **Repeatable:** Env vars, how to run, and how to verify with the test destination above are stated briefly (README, comments, or chat — match the task size; a one-file script may need only inline or chat notes). +- [ ] **Quick path verification:** If this is a **quickstart-shaped** script (one file / curl flow), success is **publish accepted (202 / event id printed)** plus **Hookdeck Console** (or dashboard logs) for the test webhook — **not** a hard-fail `events.list` / `GET …/events` immediately after publish. If you must list events in code, **poll with retries** per **{{DOCS_URL}}/concepts#publish-acceptance-vs-observability**; do not throw on the first empty list. **When editing an existing application repository (Existing application or equivalent):** diff --git a/docs/agent-evaluation/results/README.md b/docs/agent-evaluation/results/README.md index 9fe1615cc..c6ca26655 100644 --- a/docs/agent-evaluation/results/README.md +++ b/docs/agent-evaluation/results/README.md @@ -36,7 +36,7 @@ npm run score -- --run results/runs/-scenario-NN --write npm run score -- --run results/runs/-scenario-NN --llm --write ``` -**Execution** (curl/SDK against live Outpost with `OUTPOST_API_KEY`) is **not** recorded in these JSON files. Use **`../scripts/execute-ci-artifacts.sh`** after **`eval:ci`**, or the second step in **`.github/workflows/docs-agent-eval-ci.yml`**, and the **Execution (full pass)** rows in `[../scenarios/](../scenarios/)` for human notes. +**Execution** when `OUTPOST_API_KEY` is set: the agent may run live smoke tests during eval (evidence in `transcript.json`; **LLM judge** scores execution-style criteria). 
**`execute-ci-artifacts.sh`** (CI step 2) re-runs saved artifacts deterministically. Without the key, execution is transcript-only / manual — see **Execution (full pass)** rows in `[../scenarios/](../scenarios/)`. --- diff --git a/docs/agent-evaluation/scripts/ci-eval.sh b/docs/agent-evaluation/scripts/ci-eval.sh index 980442967..677617da2 100755 --- a/docs/agent-evaluation/scripts/ci-eval.sh +++ b/docs/agent-evaluation/scripts/ci-eval.sh @@ -1,11 +1,12 @@ #!/usr/bin/env bash # CI-friendly agent eval: scenarios 01+02 with heuristic + LLM judge (Success criteria from each scenario .md). # -# Required secrets (e.g. GitHub Actions): ANTHROPIC_API_KEY, EVAL_TEST_DESTINATION_URL +# Required secrets (e.g. GitHub Actions): ANTHROPIC_API_KEY, EVAL_TEST_DESTINATION_URL, OUTPOST_API_KEY # Optional: same vars in docs/agent-evaluation/.env for local runs. # # Scenarios: 01 = curl quickstart shape; 02 = TypeScript SDK script. See README § CI. -# After success, run ./scripts/execute-ci-artifacts.sh with OUTPOST_API_KEY + OUTPOST_TEST_WEBHOOK_URL for live Outpost (CI does this automatically). +# OUTPOST_API_KEY is forwarded to the agent sandbox so it can run smoke tests during eval:ci. +# After success, ./scripts/execute-ci-artifacts.sh re-runs the saved artifacts (CI step 2). set -euo pipefail ROOT="$(cd "$(dirname "$0")/.." 
&& pwd)" @@ -19,5 +20,13 @@ if [[ -z "${EVAL_TEST_DESTINATION_URL:-}" ]]; then echo "ci-eval: EVAL_TEST_DESTINATION_URL is not set" >&2 exit 1 fi +if [[ -z "${OUTPOST_API_KEY:-}" ]]; then + echo "ci-eval: OUTPOST_API_KEY is not set (required so the agent can run live Outpost smoke tests)" >&2 + exit 1 +fi + +export OUTPOST_TEST_WEBHOOK_URL="${OUTPOST_TEST_WEBHOOK_URL:-${EVAL_TEST_DESTINATION_URL:-}}" +: "${OUTPOST_API_BASE_URL:=https://api.outpost.hookdeck.com/2025-07-01}" +export OUTPOST_API_BASE_URL OUTPOST_TEST_WEBHOOK_URL exec npm run eval:ci diff --git a/docs/agent-evaluation/src/llm-judge.ts b/docs/agent-evaluation/src/llm-judge.ts index 5c0d17c74..34d2dd582 100644 --- a/docs/agent-evaluation/src/llm-judge.ts +++ b/docs/agent-evaluation/src/llm-judge.ts @@ -3,13 +3,42 @@ * Feeds scenario Success criteria + assistant transcript; returns structured JSON from the model. */ -import { readFile } from "node:fs/promises"; +import { readFile, writeFile } from "node:fs/promises"; import { basename, dirname, join } from "node:path"; import { extractTranscriptScoringText } from "./score-transcript.js"; const ANTHROPIC_MESSAGES_URL = "https://api.anthropic.com/v1/messages"; +/** Latest Sonnet tier (Feb 2026); override with EVAL_SCORE_MODEL. 
*/ const DEFAULT_SCORE_MODEL = "claude-sonnet-4-6"; const MAX_TRANSCRIPT_CHARS = 180_000; +const MAX_JUDGE_ATTEMPTS = 3; +const JUDGE_MAX_TOKENS = 8192; + +interface AnthropicJudgeResponse { + readonly content?: readonly { type?: string; text?: string }[]; + readonly stop_reason?: string; + readonly usage?: { + readonly input_tokens?: number; + readonly output_tokens?: number; + }; +} + +export interface JudgeAttemptDiagnostics { + readonly attempt: number; + readonly stop_reason?: string; + readonly input_tokens?: number; + readonly output_tokens?: number; + readonly raw_text_length: number; + readonly raw_text: string; + readonly parse_error: string; +} + +export interface LlmJudgeFailureArtifact { + readonly failedAt: string; + readonly model: string; + readonly runFile: string; + readonly attempts: readonly JudgeAttemptDiagnostics[]; +} export interface LlmCriterionJudgment { readonly criterion: string; @@ -64,7 +93,13 @@ function parseJudgeJson(text: string): Omit; + let parsed: Record; + try { + parsed = JSON.parse(raw) as Record; + } catch (parse_err) { + const detail = parse_err instanceof Error ? parse_err.message : String(parse_err); + throw new Error(`JSON.parse failed: ${detail}`); + } const overall = Boolean(parsed.overall_transcript_pass); const criteriaIn = parsed.criteria; const criteria: LlmCriterionJudgment[] = []; @@ -99,10 +134,16 @@ function parseJudgeJson(text: string): Omit { + const path = judgeFailureArtifactPath(run_path); + await writeFile(path, `${JSON.stringify(artifact, null, 2)}\n`, "utf8"); + return path; +} + +function logJudgeAttempt( + attempt: number, + max_attempts: number, + api_body: AnthropicJudgeResponse, + raw_text: string, +): void { + const usage = api_body.usage; + console.error( + `LLM judge attempt ${attempt}/${max_attempts}: stop_reason=${api_body.stop_reason ?? "unknown"} ` + + `input_tokens=${usage?.input_tokens ?? "?"} output_tokens=${usage?.output_tokens ?? 
"?"} ` + + `raw_chars=${raw_text.length}`, + ); +} + +async function callAnthropicJudge(options: { + readonly api_key: string; + readonly model: string; + readonly system: string; + readonly user_content: string; + readonly retry_note?: string; +}): Promise<{ readonly text: string; readonly body: AnthropicJudgeResponse }> { + const user_content = options.retry_note + ? `${options.user_content}\n\n---\n\n${options.retry_note}` + : options.user_content; + + const res = await fetch(ANTHROPIC_MESSAGES_URL, { + method: "POST", + headers: { + "content-type": "application/json", + "x-api-key": options.api_key, + "anthropic-version": "2023-06-01", + }, + body: JSON.stringify({ + model: options.model, + max_tokens: JUDGE_MAX_TOKENS, + system: options.system, + messages: [{ role: "user", content: user_content }], + }), + }); -Eval-harness / transcript environment: The assistant may run Bash (e.g. npx tsx, shell quickstarts) inside an automated eval where live secrets such as OUTPOST_API_KEY are often NOT injected, even when a later CI step verifies artifacts with real keys. If the transcript shows the assistant attempted that smoke run and it failed ONLY because required env vars or secrets were missing or empty (clear message: explicit throw, documented "set OUTPOST_API_KEY", 401/403 from missing auth, tool_result text stating unset variable, etc.)—and the written artifacts otherwise match the scenario (SDK usage, endpoints, fail-fast checks, README)—then treat Success-criteria rows about "execution", "runs to completion", or "live API" as PASS for that reason. Keep execution_in_transcript.pass = null (you still did not run code yourself). Set overall_transcript_pass to true when every criteria[] entry passes under these rules; do not fail the whole judgment solely because the eval transcript lacked keys. 
Do NOT use this exception when the script was never run, the error is vague, or failure likely reflects bugs, syntax errors, wrong API usage, or misconfiguration unrelated to missing env in the sandbox.`; + if (!res.ok) { + const err_text = await res.text(); + throw new Error(`Anthropic API ${res.status}: ${err_text.slice(0, 2000)}`); + } + + const body = (await res.json()) as AnthropicJudgeResponse; + const text_block = body.content?.find((c) => c.type === "text"); + const text = text_block?.text ?? ""; + return { text, body }; +} export async function llmJudgeRun(options: { readonly runPath: string; @@ -148,39 +270,69 @@ ${transcript} --- -Judge the transcript against the Success criteria. Remember: execution (running curl or scripts against a live API) is NOT evidenced by you unless the transcript shows successful HTTP/tool outcomes; normally set execution_in_transcript.pass to null. If the transcript shows a run attempt failed only because OUTPOST_API_KEY or other required env was missing in the eval sandbox, apply the harness exception in your system instructions for execution-style criteria—do not mark overall_transcript_pass false for that alone.`; +${buildJudgeUserTail()}`; - const res = await fetch(ANTHROPIC_MESSAGES_URL, { - method: "POST", - headers: { - "content-type": "application/json", - "x-api-key": options.apiKey, - "anthropic-version": "2023-06-01", - }, - body: JSON.stringify({ + const system = buildJudgeSystem(); + const attempt_diagnostics: JudgeAttemptDiagnostics[] = []; + let judged: ReturnType | undefined; + + for (let attempt = 1; attempt <= MAX_JUDGE_ATTEMPTS; attempt++) { + const retry_note = + attempt === 1 + ? undefined + : `IMPORTANT: Your previous response was not valid complete JSON (see prior attempt diagnostics). ` + + `Output ONLY a single complete JSON object matching the schema in your system instructions — ` + + `no markdown fences, no commentary. 
Ensure the response ends with closing braces and includes summary.`; + + const { text, body } = await callAnthropicJudge({ + api_key: options.apiKey, model, - max_tokens: 8192, - system: JUDGE_SYSTEM, - messages: [{ role: "user", content: userContent }], - }), - }); + system, + user_content: userContent, + retry_note, + }); - if (!res.ok) { - const errText = await res.text(); - throw new Error(`Anthropic API ${res.status}: ${errText.slice(0, 2000)}`); + logJudgeAttempt(attempt, MAX_JUDGE_ATTEMPTS, body, text); + + try { + judged = parseJudgeJson(text); + break; + } catch (parse_err) { + const parse_error = + parse_err instanceof Error ? parse_err.message : String(parse_err); + attempt_diagnostics.push({ + attempt, + stop_reason: body.stop_reason, + input_tokens: body.usage?.input_tokens, + output_tokens: body.usage?.output_tokens, + raw_text_length: text.length, + raw_text: text, + parse_error, + }); + if (attempt < MAX_JUDGE_ATTEMPTS) { + console.error( + `LLM judge attempt ${attempt} parse failed (${parse_error}); retrying…`, + ); + } + } } - const body = (await res.json()) as { - content?: readonly { type?: string; text?: string }[]; - }; - const textBlock = body.content?.find((c) => c.type === "text"); - const text = textBlock?.text ?? ""; - let judged: ReturnType; - try { - judged = parseJudgeJson(text); - } catch { + if (!judged) { + const last = attempt_diagnostics[attempt_diagnostics.length - 1]; + const failure_artifact: LlmJudgeFailureArtifact = { + failedAt: new Date().toISOString(), + model, + runFile: options.runPath, + attempts: attempt_diagnostics, + }; + const artifact_path = await writeJudgeFailureArtifact(options.runPath, failure_artifact); + console.error(`Wrote ${artifact_path} (full judge raw responses from ${attempt_diagnostics.length} attempts)`); + throw new Error( - `Judge did not return parseable JSON. First 800 chars:\n${text.slice(0, 800)}`, + `Judge did not return parseable JSON after ${MAX_JUDGE_ATTEMPTS} attempts. 
` + + `Last stop_reason=${last?.stop_reason ?? "unknown"} ` + + `output_tokens=${last?.output_tokens ?? "?"} raw_chars=${last?.raw_text_length ?? 0}. ` + + `Full responses: ${artifact_path}. First 800 chars of last attempt:\n${(last?.raw_text ?? "").slice(0, 800)}`, ); } diff --git a/docs/agent-evaluation/src/run-agent-eval.ts b/docs/agent-evaluation/src/run-agent-eval.ts index dabf6d877..7cc0dbfe0 100644 --- a/docs/agent-evaluation/src/run-agent-eval.ts +++ b/docs/agent-evaluation/src/run-agent-eval.ts @@ -1,8 +1,9 @@ /** * Automated Outpost onboarding agent evals via the Claude Agent SDK. * - * Requires ANTHROPIC_API_KEY (and EVAL_TEST_DESTINATION_URL). Does not call Outpost. - * For a full eval, humans (or a separate verifier) run generated artifacts using OUTPOST_API_KEY — see README. + * Requires ANTHROPIC_API_KEY and EVAL_TEST_DESTINATION_URL. When OUTPOST_API_KEY is set, + * it is forwarded to the agent sandbox so smoke tests can hit live Outpost during eval. + * CI also runs execute-ci-artifacts.sh to re-run saved artifacts — see README. * * @see https://platform.claude.com/docs/en/agent-sdk/overview */ @@ -786,6 +787,9 @@ Environment: Values can be set in docs/agent-evaluation/.env (loaded automatically) or exported in the shell. ANTHROPIC_API_KEY Required EVAL_TEST_DESTINATION_URL Required — Hookdeck Console Source URL (fed into {{TEST_DESTINATION_URL}}) + OUTPOST_API_KEY Required for ci-eval.sh / GitHub Actions; forwarded to the agent when set (live smoke tests). Optional for local transcript-only runs without execution. 
+ OUTPOST_TEST_WEBHOOK_URL Optional — defaults to EVAL_TEST_DESTINATION_URL in ci-eval.sh + OUTPOST_API_BASE_URL Optional — managed default when unset in ci-eval.sh EVAL_API_BASE_URL Optional (default: managed production URL) EVAL_TOPICS_LIST Optional EVAL_DOCS_URL Optional (ignored for Documentation links when EVAL_LOCAL_DOCS is set; still used to derive the default EVAL_LLMS_FULL_URL unless that is set) diff --git a/docs/content/concepts.mdoc b/docs/content/concepts.mdoc index 4d4944a59..eff9a74ec 100644 --- a/docs/content/concepts.mdoc +++ b/docs/content/concepts.mdoc @@ -67,6 +67,17 @@ Configure retry behavior with `MAX_RETRY_LIMIT`, `RETRY_INTERVAL_SECONDS`, and ` Read more about [event delivery and retries](/docs/outpost/features/event-delivery). +### Publish acceptance vs observability {#publish-acceptance-vs-observability} + +When you call **publish** (`POST /publish` or the SDK equivalent), Outpost responds with **HTTP 202** and an **event id** once it has **accepted** the event for processing. That response confirms the event entered the pipeline — it does **not** guarantee the event is already listed in the **events** or **attempts** APIs, or that the webhook has been delivered. + +If you list tenant events (`GET …/events` with a `tenant_id` query parameter) or load delivery attempts immediately after publish, the result may be empty for a short time while Outpost indexes the event and schedules delivery. That is normal eventual consistency, not a failed publish. + +How you verify success depends on what you are building: + +- **Publish-and-confirm flows** (including the quickstarts in this documentation): treat **202 + event id** as publish success in your code, then confirm delivery in **Hookdeck Console**, your project's dashboard **logs**, or by watching your webhook endpoint. 
+- **Activity or audit UIs** in your product: **poll** the events or attempts APIs with backoff until the event appears — see [Building your own UI](/docs/outpost/guides/building-your-own-ui) for patterns. + ## Portal Outpost includes a built-in self-service portal for your tenants to manage their destinations, view events, and retry failed deliveries. The portal is accessed via a short-lived JWT token generated by the API. diff --git a/docs/content/quickstarts/hookdeck-outpost-curl.mdoc b/docs/content/quickstarts/hookdeck-outpost-curl.mdoc index 2b8c3e7f7..21f4d9169 100644 --- a/docs/content/quickstarts/hookdeck-outpost-curl.mdoc +++ b/docs/content/quickstarts/hookdeck-outpost-curl.mdoc @@ -88,6 +88,10 @@ curl --request POST "$OUTPOST_API_BASE_URL/publish" \ On success, the response status is **202** — the event was accepted for delivery. +{% callout %} +**Publish is asynchronous (HTTP 202).** A **202** response means Outpost **accepted** the event and returned an **event id** in the response body. It does not mean the event is already listed in the **events** or **attempts** APIs, or that your webhook has been delivered — processing and delivery continue in the background. Note the event id from the response and confirm delivery using the steps below. See [Publish acceptance vs observability](/docs/outpost/concepts#publish-acceptance-vs-observability) for how this differs from listing events or attempts in your own integrations. +{% /callout %} + ## HTTP status codes (quick reference) | Step | Request | Success status | @@ -98,8 +102,34 @@ On success, the response status is **202** — the event was accepted for delive ## Verify delivery -- In **Hookdeck Console**, inspect the connection or destination you used (for example the Source you created) and confirm the webhook request and payload look correct. 
-- In the **Hookdeck Dashboard**, open **your Outpost project** and review **logs** (and any deliveries or event views your project exposes) to confirm the event was processed and delivered. +After publish, parse the **event id** from the **202** response body (for example with `jq`). + +- In **Hookdeck Console**, open the Source or connection you used for the webhook URL and confirm the request arrived with the expected payload. +- In the **Hookdeck Dashboard**, open **your Outpost project** and review **logs** to confirm the event was processed and delivered. + +Calling `GET …/events` with `tenant_id` (and optional `topic`) immediately after publish may return an empty list for a short time while Outpost indexes the event. That is normal eventual consistency, not a failed publish — you do not need the events API to finish this walkthrough. If you are building an activity or audit screen in your product, poll with retries instead of a single request; see [Publish acceptance vs observability](/docs/outpost/concepts#publish-acceptance-vs-observability). + +### Optional: poll the events API + +Use this pattern when your application must confirm an event appears in the events API (for example while building an activity feed): + +```sh +EVENT_ID="<event-id-from-publish-response>" + +for attempt in $(seq 1 8); do + if curl -s -G "$OUTPOST_API_BASE_URL/events" \ + --data-urlencode "tenant_id=$TENANT_ID" \ + --data-urlencode "topic=user.created" \ + -H "Authorization: Bearer $OUTPOST_API_KEY" \ + | grep -q "$EVENT_ID"; then + echo "Verified in events list: $EVENT_ID" + break + fi + sleep 1 +done +``` + +For this quickstart, Console and dashboard logs are sufficient. 
## Next steps diff --git a/docs/content/quickstarts/hookdeck-outpost-go.mdoc b/docs/content/quickstarts/hookdeck-outpost-go.mdoc index 411dfeca1..051c40388 100644 --- a/docs/content/quickstarts/hookdeck-outpost-go.mdoc +++ b/docs/content/quickstarts/hookdeck-outpost-go.mdoc @@ -143,6 +143,10 @@ func main() { } ``` +{% callout %} +**Publish is asynchronous (HTTP 202).** A successful `Publish` call means Outpost **accepted** the event and returned an **event id**. It does not mean the event is already visible in the **events** or **attempts** APIs, or that your webhook has been delivered — processing and delivery continue in the background. When the program finishes, note the printed event id and confirm delivery using the steps below. See [Publish acceptance vs observability](/docs/outpost/concepts#publish-acceptance-vs-observability) for how this differs from listing events or attempts in your own code. +{% /callout %} + Run: ```sh @@ -153,9 +157,50 @@ For all topics on that destination, use `components.CreateTopicsTopicsEnum(compo ## Verify delivery -- In **Hookdeck Console**, confirm the webhook hit your test URL. +When the program completes, you should see a published event id in the terminal. + +- In **Hookdeck Console**, open the Source or connection you used for `OUTPOST_TEST_WEBHOOK_URL` and confirm the webhook request arrived with the expected payload. - In the **Hookdeck Dashboard**, open **your Outpost project** and review **logs** to confirm the event was processed and delivered. +Listing events with `s.Events.List` (or `GET …/events` with `tenant_id`) immediately after publish may return an empty list for a short time while Outpost indexes the event. That is normal eventual consistency, not a failed publish — you do not need the events API to finish this walkthrough. 
If you are building an activity or audit screen in your product, poll with retries instead of a single list call; see [Publish acceptance vs observability](/docs/outpost/concepts#publish-acceptance-vs-observability). + +### Optional: poll the events API + +Add after the successful publish block in `main` (add `"time"` to your imports if needed). Use this when your application must confirm an event appears in the events API (for example while building an activity feed): + +```go +publishedID := pubRes.GetPublishResponse().GetID() +var found bool +for attempt := 1; attempt <= 8 && !found; attempt++ { + listRes, err := s.Events.List(ctx, operations.ListEventsRequest{ + TenantID: []string{tenantID}, + Topic: []string{topic}, + }) + if err != nil { + log.Fatal(err) + } + if listRes.EventPaginatedResult != nil { + for _, ev := range listRes.EventPaginatedResult.GetModels() { + if ev.GetID() == publishedID { + fmt.Println("Verified in events list:", publishedID) + found = true + break + } + } + } + if !found { + time.Sleep(time.Second) + } +} +if !found { + fmt.Println( + "Event not visible in the events list yet; check Hookdeck Console for delivery.", + ) +} +``` + +For this quickstart, Console and dashboard logs are sufficient. + ## Next steps - [Destination types](/docs/outpost/overview#supported-destinations) diff --git a/docs/content/quickstarts/hookdeck-outpost-python.mdoc b/docs/content/quickstarts/hookdeck-outpost-python.mdoc index a57733e9b..a9c0e63e2 100644 --- a/docs/content/quickstarts/hookdeck-outpost-python.mdoc +++ b/docs/content/quickstarts/hookdeck-outpost-python.mdoc @@ -114,6 +114,10 @@ published = client.publish( print("Published event id:", published.id) ``` +{% callout %} +**Publish is asynchronous (HTTP 202).** A successful `publish` means Outpost **accepted** the event and returned an **event id**. 
It does not mean the event is already visible in the **events** or **attempts** APIs, or that your webhook has been delivered — processing and delivery continue in the background. When the script finishes, note the printed event id and confirm delivery using the steps below. See [Publish acceptance vs observability](/docs/outpost/concepts#publish-acceptance-vs-observability) for how this differs from listing events or attempts in your own code. +{% /callout %} + Run: ```sh @@ -124,9 +128,42 @@ Use `topics: ["*"]` on the destination to receive all configured topics. ## Verify delivery -- In **Hookdeck Console**, confirm the webhook hit your test URL. +When the script completes, you should see a published event id in the terminal. + +- In **Hookdeck Console**, open the Source or connection you used for `OUTPOST_TEST_WEBHOOK_URL` and confirm the webhook request arrived with the expected payload. - In the **Hookdeck Dashboard**, open **your Outpost project** and review **logs** to confirm the event was processed and delivered. +Listing events with `client.events.list` (or `GET …/events` with `tenant_id`) immediately after publish may return an empty list for a short time while Outpost indexes the event. That is normal eventual consistency, not a failed publish — you do not need the events API to finish this walkthrough. If you are building an activity or audit screen in your product, poll with retries instead of a single list call; see [Publish acceptance vs observability](/docs/outpost/concepts#publish-acceptance-vs-observability). 
+ +### Optional: poll the events API + +Use this pattern when your application needs to confirm an event appears in the events API (for example while building an activity feed): + +```python +import time + +event_id = published.id +found = None +for attempt in range(1, 9): + res = client.events.list( + request={"tenant_id": tenant_id, "topic": topic}, + ) + models = res.models or [] + found = next((e for e in models if e.id == event_id), None) + if found: + break + time.sleep(1) + +if found: + print("Verified in events list:", found.id) +else: + print( + "Event not visible in the events list yet; check Hookdeck Console for delivery.", + ) +``` + +For this quickstart, Console and dashboard logs are sufficient. + ## Next steps - [Destination types](/docs/outpost/overview#supported-destinations) diff --git a/docs/content/quickstarts/hookdeck-outpost-typescript.mdoc b/docs/content/quickstarts/hookdeck-outpost-typescript.mdoc index f9832db03..10bbaa481 100644 --- a/docs/content/quickstarts/hookdeck-outpost-typescript.mdoc +++ b/docs/content/quickstarts/hookdeck-outpost-typescript.mdoc @@ -115,6 +115,10 @@ const published = await outpost.publish({ console.log("Published event id:", published.id); ``` +{% callout %} +**Publish is asynchronous (HTTP 202).** A successful `publish` means Outpost **accepted** the event and returned an **event id**. It does not mean the event is already visible in the **events** or **attempts** APIs, or that your webhook has been delivered — processing and delivery continue in the background. When the script finishes, note the printed event id and confirm delivery using the steps below. See [Publish acceptance vs observability](/docs/outpost/concepts#publish-acceptance-vs-observability) for how this differs from listing events or attempts in your own code. 
+{% /callout %} + Run: ```sh @@ -125,9 +129,36 @@ To subscribe the destination to all topics, pass `topics: ["*"]` instead of `[to ## Verify delivery -- In **Hookdeck Console**, inspect the Source or connection you used for `OUTPOST_TEST_WEBHOOK_URL` and confirm the webhook request arrived as expected. +When the script completes, you should see a published event id in the terminal. + +- In **Hookdeck Console**, open the Source or connection you used for `OUTPOST_TEST_WEBHOOK_URL` and confirm the webhook request arrived with the expected payload. - In the **Hookdeck Dashboard**, open **your Outpost project** and review **logs** to confirm the event was processed and delivered. +Listing events with `outpost.events.list` (or `GET …/events` with `tenant_id`) immediately after publish may return an empty list for a short time while Outpost indexes the event. That is normal eventual consistency, not a failed publish — you do not need the events API to finish this walkthrough. If you are building an activity or audit screen in your product, poll with retries instead of a single list call; see [Publish acceptance vs observability](/docs/outpost/concepts#publish-acceptance-vs-observability). + +### Optional: poll the events API + +Use this pattern when your application needs to confirm an event appears in the events API (for example while building an activity feed): + +```typescript +const sleep = (ms: number) => new Promise((r) => setTimeout(r, ms)); +let found; +for (let attempt = 1; attempt <= 8 && !found; attempt++) { + const { models: events } = await outpost.events.list({ tenantId, topic }); + found = events?.find((e) => e.id === published.id); + if (!found) await sleep(1000); +} +if (!found) { + console.warn( + "Event not visible in the events list yet; check Hookdeck Console for delivery.", + ); +} else { + console.log("Verified in events list:", found.id); +} +``` + +For this quickstart, Console and dashboard logs are sufficient. 
+ ## Next steps - [Destination types](/docs/outpost/overview#supported-destinations)