Skip to content

Commit 269ae79

Browse files
authored
Merge pull request #27 from navapbc/kaytv/evals
feat: scorers added to web-agent
2 parents ff5a22f + 03e31af commit 269ae79

File tree

8 files changed

+359
-2
lines changed

8 files changed

+359
-2
lines changed

mastra-test-app/package.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
"@ai-sdk/google-vertex": "^2.2.27",
3232
"@inquirer/prompts": "^7.7.1",
3333
"@mastra/core": "^0.13.2",
34+
"@mastra/evals": "^0.12.0",
3435
"@mastra/libsql": "^0.13.2",
3536
"@mastra/loggers": "^0.10.6",
3637
"@mastra/mcp": "^0.10.11",

mastra-test-app/pnpm-lock.yaml

Lines changed: 78 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

mastra-test-app/src/mastra/agents/web-automation-agent.ts

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,14 @@
1-
import { postgresStore, pgVector } from '../storage';
1+
import {
2+
createAnswerRelevancyScorer,
3+
createToxicityScorer
4+
} from "@mastra/evals/scorers/llm";
25
import { exaMCP, playwrightMCP } from '../mcp';
6+
import { pgVector, postgresStore } from '../storage';
37

48
import { Agent } from '@mastra/core/agent';
59
import { Memory } from '@mastra/memory';
610
import { anthropic } from '@ai-sdk/anthropic';
11+
import { createLanguagePreferenceScorer } from "../scorers/languagePreference";
712
import { databaseTools } from '../tools/database-tools';
813
import { google } from '@ai-sdk/google';
914
import { openai } from '@ai-sdk/openai';
@@ -76,7 +81,7 @@ export const webAutomationAgent = new Agent({
7681
**Web Navigation:**
7782
- Navigate to websites and analyze page structure
7883
- If participant has a preferred language, immediately look for and change the website language
79-
- Common language selectors: language dropdowns, flag icons, "EN" buttons, or language preference settings
84+
- Common language selectors: "Select Language" dropdowns, flag icons, buttons that say "EN" or "SP", or language preference settings
8085
- Identify and interact with elements (buttons, forms, links, dropdowns)
8186
8287
When performing actions:
@@ -150,6 +155,22 @@ export const webAutomationAgent = new Agent({
150155
)
151156
},
152157
memory: memory,
158+
scorers: {
159+
relevancy: {
160+
scorer: createAnswerRelevancyScorer({ model: google("gemini-2.5-pro") }),
161+
sampling: { type: "ratio", rate: 0.5 }
162+
},
163+
safety: {
164+
scorer: createToxicityScorer({ model: google("gemini-2.5-pro") }),
165+
sampling: { type: "ratio", rate: 1 }
166+
},
167+
languagePreference: {
168+
scorer: createLanguagePreferenceScorer({
169+
model: google("gemini-2.5-pro"),
170+
}),
171+
sampling: { rate: 1, type: "ratio" },
172+
},
173+
},
153174
defaultStreamOptions: {
154175
maxSteps: 50,
155176
maxRetries: 3,
Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
import { LANGUAGE_PREFERENCE_PROMPT, createPreprocessPrompt, createAnalysisPrompt, createReasonPrompt } from './prompt';
2+
3+
import { LanguageModel } from '@mastra/core';
4+
import { createScorer } from '@mastra/core/scores';
5+
import { z } from 'zod';
6+
7+
/** Narrowing helper: true for any non-null object (arrays included). */
function isRecord(value: unknown): value is Record<string, unknown> {
  return typeof value === 'object' && value !== null;
}

/**
 * Renders an arbitrary value as text that is safe to embed in a judge prompt.
 *
 * Strings pass through untouched, arrays are rendered element by element and
 * joined with newlines, objects contribute their string `text` field when they
 * have one and are JSON-serialised otherwise. Nullish values become `''`.
 */
function stringifyForPrompt(value: unknown): string {
  if (value == null) return '';
  if (typeof value === 'string') return value;
  if (Array.isArray(value)) return value.map(item => stringifyForPrompt(item)).join('\n');
  if (isRecord(value)) {
    if (typeof value.text === 'string') return value.text;
    try {
      // Serialise instead of letting template interpolation produce "[object Object]".
      return JSON.stringify(value) ?? '';
    } catch {
      // Circular structures / BigInt values cannot be serialised; fall back to String().
      return String(value);
    }
  }
  return String(value);
}

/**
 * Converts a scorer run's `input` or `output` payload into a single string.
 *
 * - Array payloads are treated as message lists: each message's `content` is
 *   rendered and the results are joined with newlines.
 * - Object payloads with a truthy `text` field yield that text.
 * - Falsy payloads yield `''`.
 * - Anything else is rendered via `stringifyForPrompt`, so the result is
 *   always a string.
 */
function runPayloadToText(payload: unknown): string {
  if (Array.isArray(payload)) {
    return payload
      .map(message => stringifyForPrompt(isRecord(message) ? message.content : message))
      .join('\n');
  }
  if (!payload) return '';
  if (isRecord(payload) && payload.text) return stringifyForPrompt(payload.text);
  return stringifyForPrompt(payload);
}

/**
 * Creates the "Language Preference Compliance" scorer.
 *
 * The scorer is assembled as a four-step pipeline on top of `createScorer`:
 * 1. `preprocess` – builds a prompt from the run's input and output text so the
 *    judge can extract the participant language and language-change actions.
 * 2. `analyze` – builds a prompt from the preprocess result so the judge can
 *    rate compliance and report its confidence (0..1).
 * 3. `generateScore` – maps the compliance level to a base score and scales it
 *    by the reported confidence.
 * 4. `generateReason` – builds a prompt asking the judge to explain the score.
 *
 * @param model - Language model used as the judge for every prompt step.
 * @returns The scorer produced by the `createScorer(...)` builder chain.
 */
export function createLanguagePreferenceScorer({
  model,
}: {
  model: LanguageModel;
}) {
  return createScorer({
    name: 'Language Preference Compliance',
    description: 'Evaluates if the web automation agent changes website language to match participant language preferences',
    judge: {
      model,
      instructions: LANGUAGE_PREFERENCE_PROMPT,
    },
  })
    .preprocess({
      description: 'Extract language preferences and actions from the conversation',
      outputSchema: z.object({
        participantLanguage: z.string().nullable(),
        languageChangeActions: z.array(z.string()),
        websiteLanguageSet: z.boolean(),
        targetLanguage: z.string().nullable(),
      }),
      createPrompt: ({ run }) => {
        // For the web automation agent, the output contains the agent's actions
        // and reasoning. Both sides are normalised to plain strings so that
        // structured payloads never reach the prompt as "[object Object]".
        const agentOutput = runPayloadToText(run.output);
        const userInput = runPayloadToText(run.input);

        return createPreprocessPrompt({ userInput, agentOutput });
      },
    })
    .analyze({
      description: 'Evaluate language preference compliance',
      outputSchema: z.object({
        compliance: z.enum(['excellent', 'good', 'partial', 'poor', 'no_preference']),
        languageMatch: z.boolean(),
        actionsTaken: z.boolean(),
        confidence: z.number().min(0).max(1),
      }),
      createPrompt: ({ results }) => {
        const { participantLanguage, languageChangeActions, websiteLanguageSet, targetLanguage } = results.preprocessStepResult;

        return createAnalysisPrompt({
          participantLanguage,
          languageChangeActions,
          websiteLanguageSet,
          targetLanguage,
        });
      },
    })
    .generateScore(({ results }) => {
      const { compliance, confidence } = results.analyzeStepResult;

      // Convert compliance level to a numerical base score.
      const complianceScores: Record<string, number> = {
        excellent: 1.0,
        good: 0.8,
        partial: 0.5,
        poor: 0.2,
        // Full base score when no preference was specified; note that the
        // final score is still scaled by the judge's confidence below.
        no_preference: 1.0,
      };

      // Unknown compliance labels score 0.
      const baseScore = complianceScores[compliance] ?? 0;
      return baseScore * confidence;
    })
    .generateReason({
      description: 'Generate a reason for the language preference compliance score',
      createPrompt: ({ results, score }) => {
        const { compliance, languageMatch, actionsTaken } = results.analyzeStepResult;
        const { participantLanguage, targetLanguage } = results.preprocessStepResult;

        return createReasonPrompt({
          score,
          compliance,
          languageMatch,
          actionsTaken,
          participantLanguage,
          targetLanguage,
        });
      },
    });
}

0 commit comments

Comments
 (0)