Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions .github/workflows/docs-agent-eval-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,15 @@ jobs:
run: ./scripts/execute-ci-artifacts.sh

# Transcripts, heuristic + LLM scores, generated scripts — present after eval; execute step may add logs in-place.
# Scrub secrets from the eval outputs before the upload step below publishes them.
# The script is given the same secret values through env so that literal occurrences
# of them can be redacted (see docs/agent-evaluation/src/redact-secrets.ts). Best effort only.
- name: Redact secrets in eval artifacts (best effort)
# always(): run this step even when an earlier step in the job failed.
if: always()
env:
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
EVAL_TEST_DESTINATION_URL: ${{ secrets.EVAL_TEST_DESTINATION_URL }}
OUTPOST_API_KEY: ${{ secrets.OUTPOST_API_KEY }}
# NOTE(review): sourced from the EVAL_TEST_DESTINATION_URL secret, same as the variable above — confirm this aliasing is intended.
OUTPOST_TEST_WEBHOOK_URL: ${{ secrets.EVAL_TEST_DESTINATION_URL }}
run: node --import tsx scripts/redact-eval-artifacts.ts

- name: Upload agent eval outputs (debug)
if: always()
uses: actions/upload-artifact@v7
Expand Down
4 changes: 2 additions & 2 deletions docs/agent-evaluation/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -162,9 +162,9 @@ npm run viz:trajectory -- --run results/runs/<stamp>-scenario-01/transcript.json

By default the HTML is written beside **`transcript.json`** (or **`--out`** on the viz CLI). Open the file in a browser: click a row to highlight it and set **`#s=<step>`** in the URL for a quick bookmark. The page can **filter by tool kind**, **narrow Read rows to documentation vs code paths** (by extension), and **require doc heuristics** (reference, OpenAPI, quickstart, published URL, etc.) so you can focus on documentation-looking steps.

**Privacy:** transcripts and tool results may contain secrets; the generator applies light redaction to previews, but **treat HTML output as sensitive** and do not commit real run artifacts.
**Privacy:** transcripts and tool results may contain secrets. The runner applies **best-effort redaction** when writing `transcript.json`, `llm-score.json`, `llm-judge-failure.json`, and eval failure sidecars (see `src/redact-secrets.ts`, which redacts API keys and test webhook URLs taken from env); CI runs **`scripts/redact-eval-artifacts.ts`** with the same env before uploading `results/runs` artifacts. Trajectory HTML applies light redaction to previews only. **Treat all run outputs as sensitive** — redaction is not a guarantee — and do not commit real run artifacts.

**Regression check:** `npm run test:trajectory` — asserts step extraction and turn indexing against a tiny fixture.
**Regression checks:** `npm run test` (or individually: `npm run test:trajectory`, `npm run test:redact-secrets`) — these cover trajectory step extraction and secret redaction for eval artifacts.

Legacy flat files `*-scenario-NN.json` next to `runs/` are still accepted by **`npm run score`** for older runs.

Expand Down
4 changes: 3 additions & 1 deletion docs/agent-evaluation/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,9 @@
"score": "node --import tsx src/score-eval.ts",
"viz:trajectory": "node --import tsx src/generate-trajectory-html.ts",
"typecheck": "tsc --noEmit",
"test:trajectory": "node --import tsx src/trajectory-fixture-smoke.ts"
"test": "npm run test:trajectory && npm run test:redact-secrets",
"test:trajectory": "node --import tsx src/trajectory-fixture-smoke.ts",
"test:redact-secrets": "node --import tsx src/redact-secrets.test.ts"
},
"engines": {
"node": ">=18"
Expand Down
62 changes: 62 additions & 0 deletions docs/agent-evaluation/scripts/redact-eval-artifacts.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
#!/usr/bin/env -S node --import tsx
/**
* Best-effort in-place redaction of JSON under results/runs/ before CI artifact upload.
* See src/redact-secrets.ts.
*/

import { readdir, readFile, stat, writeFile } from "node:fs/promises";
import { join } from "node:path";
import { fileURLToPath } from "node:url";
import { redactEvalArtifactJson } from "../src/redact-secrets.js";

// Parent of the directory that contains this script (the agent-evaluation package root).
const EVAL_ROOT = join(fileURLToPath(new URL(".", import.meta.url)), "..");
// Directory whose JSON files are redacted in place: <EVAL_ROOT>/results/runs.
const RUNS_DIR = join(EVAL_ROOT, "results", "runs");

/**
 * Recursively gathers the path of every regular file under `dir` whose name ends in `.json`.
 *
 * Entries that are neither directories nor regular files are ignored.
 *
 * @param dir Directory to scan; subdirectories are descended into.
 * @returns Paths formed by joining each entry name onto its parent directory, in the
 *   order `readdir` reports the entries.
 */
async function walkJsonFiles(dir: string): Promise<string[]> {
  const found: string[] = [];
  const children = await readdir(dir, { withFileTypes: true });
  for (const child of children) {
    const childPath = join(dir, child.name);
    if (child.isDirectory()) {
      // Descend first so nested results keep the parent's position in the listing.
      const nested = await walkJsonFiles(childPath);
      for (const nestedPath of nested) {
        found.push(nestedPath);
      }
      continue;
    }
    if (!child.isFile() || !child.name.endsWith(".json")) {
      continue;
    }
    found.push(childPath);
  }
  return found;
}

/**
 * Redacts every JSON artifact under `results/runs/` in place.
 *
 * Each `.json` file returned by `walkJsonFiles` is parsed, passed through
 * `redactEvalArtifactJson`, and rewritten only when the redacted serialization differs
 * from what is already on disk. Files are handled independently: a file that cannot be
 * read, redacted, or written is reported and skipped so the remaining artifacts are
 * still processed, and the failure is raised once the whole tree has been scanned.
 *
 * @returns Resolves once every file has been processed without error.
 * @throws Error after the scan when one or more files could not be processed, so the
 *   caller still exits non-zero.
 */
async function main(): Promise<void> {
  try {
    await stat(RUNS_DIR);
  } catch {
    // Deliberately lenient: any stat failure is treated as "no run outputs exist".
    console.error("redact-eval-artifacts: no results/runs directory — nothing to do");
    return;
  }

  const files = await walkJsonFiles(RUNS_DIR);
  let updated = 0;
  let failed = 0;
  for (const path of files) {
    try {
      const raw = await readFile(path, "utf8");
      let parsed: unknown;
      try {
        parsed = JSON.parse(raw) as unknown;
      } catch {
        // NOTE(review): unparseable files are left untouched, so any secret inside
        // them is NOT redacted — consider a plain-text redaction fallback.
        console.error(`redact-eval-artifacts: skip invalid JSON ${path}`);
        continue;
      }
      const redacted = redactEvalArtifactJson(parsed);
      // Rewrite only on a real difference; a mismatch limited to trailing whitespace
      // is not worth touching the file for.
      if (redacted !== raw && redacted !== raw.trimEnd() + "\n") {
        await writeFile(path, redacted, "utf8");
        updated++;
      }
    } catch (err: unknown) {
      // Keep going: aborting here would leave every later artifact unredacted.
      failed++;
      console.error(`redact-eval-artifacts: failed to process ${path}`, err);
    }
  }
  console.error(
    `redact-eval-artifacts: scanned ${files.length} JSON file(s), redacted ${updated}`,
  );
  if (failed > 0) {
    throw new Error(`redact-eval-artifacts: ${failed} file(s) could not be processed`);
  }
}

/** Prints the failure that rejected `main()` and terminates with exit status 1. */
function reportAndExit(failure: unknown): never {
  console.error(failure);
  process.exit(1);
}

// Script entry point.
main().catch(reportAndExit);
10 changes: 9 additions & 1 deletion docs/agent-evaluation/src/llm-judge.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import { readFile, writeFile } from "node:fs/promises";
import { basename, dirname, join } from "node:path";
import { extractTranscriptScoringText } from "./score-transcript.js";
import { redactSecretsForArtifact } from "./redact-secrets.js";

const ANTHROPIC_MESSAGES_URL = "https://api.anthropic.com/v1/messages";
/** Latest Sonnet tier (Feb 2026); override with EVAL_SCORE_MODEL. */
Expand Down Expand Up @@ -183,7 +184,14 @@ async function writeJudgeFailureArtifact(
artifact: LlmJudgeFailureArtifact,
): Promise<string> {
const path = judgeFailureArtifactPath(run_path);
await writeFile(path, `${JSON.stringify(artifact, null, 2)}\n`, "utf8");
const redacted: LlmJudgeFailureArtifact = {
...artifact,
attempts: artifact.attempts.map((a) => ({
...a,
raw_text: redactSecretsForArtifact(a.raw_text),
})),
};
await writeFile(path, `${JSON.stringify(redacted, null, 2)}\n`, "utf8");
return path;
}

Expand Down
210 changes: 210 additions & 0 deletions docs/agent-evaluation/src/redact-secrets.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,210 @@
/**
* Unit-style assertions for secret redaction (no test runner dependency).
*
* npm run test:redact-secrets
*/

import {
collectEnvSecretValues,
redactEvalArtifactJson,
redactSecrets,
redactSecretsForArtifact,
} from "./redact-secrets.js";

/**
 * Minimal assertion helper for this runner-free test script.
 *
 * @param cond Condition that must hold.
 * @param msg Description appended to the error message when `cond` is false.
 * @throws Error with the message `Assertion failed: <msg>` when `cond` is false.
 */
function assert(cond: boolean, msg: string): void {
  if (cond) {
    return;
  }
  throw new Error(`Assertion failed: ${msg}`);
}

/**
 * Fails when `needle` occurs anywhere inside `haystack`.
 *
 * @param haystack Text that must not contain the needle.
 * @param needle Substring that is expected to be absent.
 * @param msg Description used in the error message on failure.
 * @throws Error naming both `msg` and the offending `needle` when it is found.
 */
function assertNotIncludes(haystack: string, needle: string, msg: string): void {
  const position = haystack.indexOf(needle);
  if (position === -1) {
    return;
  }
  throw new Error(`Assertion failed: ${msg} (found "${needle}")`);
}

/**
 * Runs `fn` with selected `process.env` entries temporarily overridden.
 *
 * @param values Variable name → value to set for the duration of `fn`; `undefined`
 *   removes the variable.
 * @param fn Synchronous callback executed while the overrides are in place.
 *
 * The previous value of every listed variable is restored afterwards (or the variable
 * is removed if it was previously unset), even when `fn` throws.
 */
function withEnv(
  values: Record<string, string | undefined>,
  fn: () => void,
): void {
  const saved: Array<[string, string | undefined]> = [];
  for (const [name, override] of Object.entries(values)) {
    saved.push([name, process.env[name]]);
    if (override !== undefined) {
      process.env[name] = override;
    } else {
      delete process.env[name];
    }
  }
  try {
    fn();
  } finally {
    for (const [name, original] of saved) {
      if (original !== undefined) {
        process.env[name] = original;
      } else {
        delete process.env[name];
      }
    }
  }
}

/**
 * Checks `redactSecrets` against four inputs: an `Authorization: Bearer` header,
 * a `KEY=value` line, an `api_key` query parameter, and length capping via the
 * second argument.
 */
function testRedactSecretsPatterns(): void {
  // Authorization header inside a curl command.
  const token = "faketokenabcdefghijklmnopqrstuvw";
  const bearer_out = redactSecrets(
    `curl -H "Authorization: Bearer ${token}" https://api.example.com`,
  );
  assertNotIncludes(bearer_out, token, "Bearer token redacted");
  assert(bearer_out.includes("[REDACTED]"), "Bearer placeholder present");

  // KEY=value assignment: the value goes, the variable name stays.
  const env_out = redactSecrets("OUTPOST_API_KEY=opst_live_secret_value_12345678");
  assertNotIncludes(env_out, "opst_live_secret", "env KEY= line redacted");
  assert(env_out.includes("OUTPOST_API_KEY=[REDACTED]"), "env key name preserved");

  // Secret carried in a URL query string.
  const query_out = redactSecrets(
    "https://example.com/hook?api_key=supersecretvalue&topic=user.created",
  );
  assertNotIncludes(query_out, "supersecretvalue", "query api_key redacted");
  assert(query_out.includes("api_key=[REDACTED]"), "query param name preserved");

  // Length cap: 10 kept characters plus a single ellipsis character.
  const capped = redactSecrets("x".repeat(50), 10);
  assert(capped.length === 11, "maxLen adds ellipsis char");
  assert(capped.endsWith("…"), "maxLen suffix");
  assert(capped.startsWith("x".repeat(10)), "maxLen prefix preserved");
}

/**
 * Checks which environment values `collectEnvSecretValues` reports: values that are
 * too short are ignored, and sufficiently long API keys are collected.
 *
 * The webhook URL variables are explicitly unset in both cases. They also contribute
 * to the collected values, so leaving them to the ambient environment would make the
 * exact-count assertions fail on any machine or CI job that has them set.
 */
function testCollectEnvSecretValues(): void {
  withEnv(
    {
      OUTPOST_API_KEY: "short",
      ANTHROPIC_API_KEY: undefined,
      EVAL_TEST_DESTINATION_URL: undefined,
      OUTPOST_TEST_WEBHOOK_URL: undefined,
    },
    () => {
      assert(
        collectEnvSecretValues().length === 0,
        "secrets shorter than 8 chars are ignored",
      );
    },
  );

  withEnv(
    {
      OUTPOST_API_KEY: "opst_test_key_abcdefghij",
      ANTHROPIC_API_KEY: "anthropic_fake_key_abcdefghijklmnop",
      EVAL_TEST_DESTINATION_URL: undefined,
      OUTPOST_TEST_WEBHOOK_URL: undefined,
    },
    () => {
      const values = collectEnvSecretValues();
      assert(values.length === 2, "collects both env secrets when long enough");
      assert(
        values.includes("opst_test_key_abcdefghij"),
        "includes OUTPOST_API_KEY value",
      );
    },
  );
}

/**
 * With both webhook URL variables set to the same value (and the API keys unset),
 * checks that the value is collected once and that a literal occurrence of the URL
 * is replaced by the placeholder.
 */
function testWebhookUrlLiteralRedaction(): void {
  const webhook = "https://events.example.test/webhook/fake_ci_destination_path_01";
  const env: Record<string, string | undefined> = {
    EVAL_TEST_DESTINATION_URL: webhook,
    OUTPOST_TEST_WEBHOOK_URL: webhook,
    OUTPOST_API_KEY: undefined,
    ANTHROPIC_API_KEY: undefined,
  };
  withEnv(env, () => {
    const collected = collectEnvSecretValues();
    assert(collected.length === 1, "dedupes identical webhook URL env values");

    const scrubbed = redactSecretsForArtifact(
      `Turn 0 prompt includes test destination: ${webhook}`,
    );
    assertNotIncludes(scrubbed, webhook, "webhook URL literal redacted");
    assert(scrubbed.includes("[REDACTED]"), "webhook placeholder present");
  });
}

/**
 * Checks that `redactSecretsForArtifact` removes a verbatim copy of the
 * `OUTPOST_API_KEY` environment value from free text.
 */
function testRedactSecretsForArtifact(): void {
  const env: Record<string, string | undefined> = {
    OUTPOST_API_KEY: "opst_literal_echo_12345678",
    ANTHROPIC_API_KEY: undefined,
  };
  withEnv(env, () => {
    const scrubbed = redactSecretsForArtifact(
      "Agent echoed the key verbatim: opst_literal_echo_12345678 in tool output",
    );
    assertNotIncludes(scrubbed, "opst_literal_echo_12345678", "literal env value redacted");
    assert(scrubbed.includes("[REDACTED]"), "literal placeholder present");
  });
}

/**
 * Checks the serialized output of `redactEvalArtifactJson`: it ends with a newline,
 * no longer contains the secret, keeps non-secret fields, and still parses as JSON.
 */
function testRedactEvalArtifactJson(): void {
  const env: Record<string, string | undefined> = {
    OUTPOST_API_KEY: "opst_json_embed_123456789",
    ANTHROPIC_API_KEY: undefined,
  };
  withEnv(env, () => {
    const serialized = redactEvalArtifactJson({
      meta: { scenarioId: "02" },
      messages: [
        {
          role: "assistant",
          content: "export OUTPOST_API_KEY=opst_json_embed_123456789",
        },
      ],
    });
    assert(serialized.endsWith("\n"), "artifact JSON ends with newline");
    assertNotIncludes(serialized, "opst_json_embed_123456789", "JSON artifact redacts secrets");
    assert(serialized.includes('"scenarioId": "02"'), "non-secret JSON preserved");
    // Throws if redaction produced something that is no longer valid JSON.
    JSON.parse(serialized.trim());
  });
}

/**
 * Checks that redacting a payload containing both a webhook URL and an API key
 * yields parseable JSON with neither secret present and with the placeholder inside
 * the message content.
 */
function testRedactEvalArtifactJsonValid(): void {
  const webhook = "https://events.example.test/webhook/fake_ci_destination_path_01";
  const env: Record<string, string | undefined> = {
    OUTPOST_API_KEY: "opst_json_embed_123456789",
    EVAL_TEST_DESTINATION_URL: webhook,
    OUTPOST_TEST_WEBHOOK_URL: webhook,
    ANTHROPIC_API_KEY: undefined,
  };
  withEnv(env, () => {
    const serialized = redactEvalArtifactJson({
      meta: { scenarioId: "01" },
      messages: [
        {
          role: "assistant",
          content: `Use ${webhook} with key opst_json_embed_123456789`,
        },
      ],
    });
    // Parse before the substring checks, so malformed output surfaces as a parse error first.
    const decoded = JSON.parse(serialized.trim()) as {
      messages: { content: string }[];
    };
    assertNotIncludes(serialized, webhook, "serialized JSON has no raw webhook URL");
    assertNotIncludes(
      serialized,
      "opst_json_embed_123456789",
      "serialized JSON has no raw API key",
    );
    const first_content = decoded.messages[0]!.content;
    assert(first_content.includes("[REDACTED]"), "content redacted in structure");
  });
}

/**
 * Runs every redaction test in order and prints a confirmation line once all of them
 * have passed; the first failing assertion throws and aborts the run.
 */
function main(): void {
  const suites: Array<() => void> = [
    testRedactSecretsPatterns,
    testCollectEnvSecretValues,
    testWebhookUrlLiteralRedaction,
    testRedactSecretsForArtifact,
    testRedactEvalArtifactJson,
    testRedactEvalArtifactJsonValid,
  ];
  for (const run_suite of suites) {
    run_suite();
  }
  console.error("redact-secrets.test: OK");
}

main();
Loading
Loading