browserstack
diff --git a/‎ai-evals/.eslintrc.json‎
Lines changed: 19 additions & 0 deletions b/‎ai-evals/.eslintrc.json‎
Lines changed: 19 additions & 0 deletions
diff --git a/‎ai-evals/.gitignore‎
Lines changed: 4 additions & 0 deletions b/‎ai-evals/.gitignore‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎ai-evals/README.md‎
Lines changed: 75 additions & 0 deletions b/‎ai-evals/README.md‎
Lines changed: 75 additions & 0 deletions
diff --git a/‎ai-evals/action.yml‎
Lines changed: 45 additions & 0 deletions b/‎ai-evals/action.yml‎
Lines changed: 45 additions & 0 deletions
diff --git a/‎ai-evals/dist/index.js‎
Lines changed: 192 additions & 0 deletions b/‎ai-evals/dist/index.js‎
Lines changed: 192 additions & 0 deletions
diff --git a/‎ai-evals/dist/index.js.map‎
Lines changed: 7 additions & 0 deletions b/‎ai-evals/dist/index.js.map‎
Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,19 @@
+{
+  "root": true,
+  "parser": "@typescript-eslint/parser",
+  "plugins": ["@typescript-eslint"],
+  "extends": [
+    "eslint:recommended",
+    "plugin:@typescript-eslint/recommended"
+  ],
+  "env": {
+    "node": true,
+    "es2022": true,
+    "mocha": true
+  },
+  "rules": {
+    "@typescript-eslint/no-explicit-any": "off",
+    "@typescript-eslint/no-unused-vars": ["error", { "argsIgnorePattern": "^_" }]
+  },
+  "ignorePatterns": ["dist/", "node_modules/", "coverage/"]
+}
@@ -0,0 +1,4 @@
+node_modules/
+*.log
+.nyc_output/
+coverage/
@@ -0,0 +1,75 @@
+# BrowserStack AI Evals — GitHub Action
+
+Run AI evaluation experiments on every pull request. Compares scores against the previous baseline and reports pass/regression status with a sticky PR comment, Job Summary, and CI metadata tracking.
+
+## How it works
+
+1. Looks up the experiment by name (configured in the BrowserStack AI Evals UI)
+2. Triggers a new experiment run with CI metadata (branch, commit, actor, PR number)
+3. Waits for the run to complete
+4. Fetches a server-computed comparison against the previous baseline run
+5. Posts a sticky PR comment and Job Summary with per-evaluator scores, deltas, and threshold status
+6. Fails the job if any threshold is breached (configurable via `fail-on-regression`)
+
+## Quickstart
+
+```yaml
+name: AI Evals
+on:
+  pull_request:
+    paths: ['src/**', 'prompts/**']
+
+jobs:
+  evals:
+    runs-on: ubuntu-latest
+    permissions:
+      pull-requests: write
+      contents: read
+    steps:
+      - uses: actions/checkout@v4
+      - uses: browserstack/github-actions/ai-evals@v1
+        with:
+          experiment: refund-bot-eval
+          public-key: ${{ secrets.AISDK_PUBLIC_KEY }}
+          secret-key: ${{ secrets.AISDK_SECRET_KEY }}
+```
+
+## Inputs
+
+| Name | Required | Default | Description |
+|---|---|---|---|
+| `experiment` | yes | — | Experiment name (configured in the UI). |
+| `public-key` | no | — | API public key. Falls back to `AISDK_PUBLIC_KEY` env var. |
+| `secret-key` | no | — | API secret key. Falls back to `AISDK_SECRET_KEY` env var. |
+| `github-token` | no | `${{ github.token }}` | Token used to post the sticky PR comment. |
+| `fail-on-regression` | no | `true` | Fail the job when a threshold is breached. |
+| `comment-on-pr` | no | `true` | Post/edit a sticky PR comment. |
+| `timeout` | no | `900` | Max seconds to wait for the run to complete and its comparison scores to be ready. |
+
+## Exit codes
+
+| Code | Meaning |
+|---|---|
+| 0 | All thresholds passed |
+| 1 | At least one threshold breached |
+| 2 | Experiment not found |
+| 3 | Run failed or timed out |
+
+## Multiple experiments
+
+Each experiment gets its own sticky comment. Add one step per experiment; steps within a job run in sequence, so use separate jobs to run experiments in parallel:
+
+```yaml
+steps:
+  - uses: browserstack/github-actions/ai-evals@v1
+    with:
+      experiment: refund-bot-eval
+      public-key: ${{ secrets.AISDK_PUBLIC_KEY }}
+      secret-key: ${{ secrets.AISDK_SECRET_KEY }}
+
+  - uses: browserstack/github-actions/ai-evals@v1
+    with:
+      experiment: search-ranking-eval
+      public-key: ${{ secrets.AISDK_PUBLIC_KEY }}
+      secret-key: ${{ secrets.AISDK_SECRET_KEY }}
+```
@@ -0,0 +1,45 @@
+name: 'BrowserStack AI Evals'
+description: 'Run AI evaluation experiments, compare scores against the previous baseline, and report pass/regression status with a PR comment and Job Summary.'
+author: 'BrowserStack'
+branding:  # Badge icon and color displayed for the action on GitHub Marketplace
+  icon: 'check-circle'
+  color: 'green'
+
+inputs:  # GitHub passes every input to the action as a string
+  experiment:
+    description: 'Name of an Experiment in BrowserStack AI Evals to run. The Action triggers the experiment (prompt + dataset + evaluators + thresholds configured in the UI) and waits for results.'
+    required: true  # The only mandatory input; it declares no default
+
+  public-key:
+    description: 'BrowserStack AI Evals public API key. Falls back to the AISDK_PUBLIC_KEY environment variable when omitted.'
+    required: false
+    default: ''  # Empty when omitted; per the description, AISDK_PUBLIC_KEY is then used
+
+  secret-key:
+    description: 'BrowserStack AI Evals secret API key. Falls back to the AISDK_SECRET_KEY environment variable when omitted.'
+    required: false
+    default: ''  # Empty when omitted; per the description, AISDK_SECRET_KEY is then used
+
+  github-token:
+    description: 'Token used to post the sticky PR comment. Defaults to the workflow GITHUB_TOKEN; override only if you need to post as a different identity (e.g., a GitHub App).'
+    required: false
+    default: ${{ github.token }}
+
+  fail-on-regression:
+    description: 'Exit with a non-zero code if any evaluator breaches its threshold. Set to "false" to report without blocking the PR.'
+    required: false
+    default: 'true'  # Quoted so the default is the string "true", not a YAML boolean
+
+  comment-on-pr:
+    description: 'When running on a pull_request event, post (or edit) a sticky summary comment on the PR. Set to "false" to disable.'
+    required: false
+    default: 'true'  # Quoted so the default is the string "true", not a YAML boolean
+
+  timeout:
+    description: 'Maximum time (in seconds) to wait for the experiment run to complete and its comparison scores to be ready. Applies to both lifecycle polling and score aggregation polling. Default is 900 (15 minutes).'
+    required: false
+    default: '900'  # Seconds (15 minutes); quoted so it stays a string rather than an integer
+
+runs:
+  using: 'node20'  # JavaScript action executed on the Node.js 20 runtime
+  main: 'dist/index.js'  # Entry script, resolved relative to this action's directory