Skip to content

Commit d46ce10

Browse files
Add Positron Assistant Eval (#10883)
This fixes some inspect-ai test issues and adds a new eval for a hallucination - Update model selector from the 1.106 merge - Fixed issues where chat message was too quick and didn't wait for response to finish - Change `sample_1` to be a relevant eval checking for a previously seen hallucination ### QA Notes @:assistant See eval run at https://github.com/posit-dev/positron/actions/runs/19872015490 Note - Failures are because it failed the eval, which happens sometimes and is the main point of this test.
1 parent a511d50 commit d46ce10

File tree

5 files changed

+20
-9
lines changed

5 files changed

+20
-9
lines changed

.github/workflows/test-assistant-llm.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@ jobs: # to input into inspect-ai-test job below
100100
python test/assistant-inspect-ai/inspect_result_parser.py "$LATEST_LOG" --threshold 0.8
101101
102102
- name: Upload inspect-ai eval logs
103-
if: success()
103+
if: always()
104104
uses: actions/upload-artifact@v4
105105
with:
106106
name: inspect-ai-logs

test/assistant-inspect-ai/json-response-eval.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ def json_model_graded_eval():
8282
data = json.load(f)
8383

8484
test_data = data.get("tests", [])
85-
model_name = "anthropic/claude-3-7-sonnet-20250219"
85+
model_name = "anthropic/claude-haiku-4-5-20251001"
8686

8787
# Create samples from the loaded JSON data
8888
samples = [record_to_sample(record) for record in test_data]

test/assistant-inspect-ai/response-dataset.json

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,27 +3,27 @@
33
"tests": [
44
{
55
"id": "sample_1",
6-
"description": "Check context include Positron version and interpreter version.",
6+
"description": "EVAL: Check for a hallucination when the LLM makes up column data.",
77
"mode": "Ask",
8-
"question": "What version is this?",
8+
"question": "Load the forested package and make some plots of the forested data",
99
"model_response": "",
10-
"target": "The answer should return at least the version of Positron being used and either a Python or R version."
10+
"target": "The response should load the forested package and then generate code to explore the data structure before creating any visualizations. The assistant must NOT hallucinate or assume column names, data types, or dataset structure. It should first run (or ask the user to run) exploratory code such as data(), head(), str(), or similar functions to examine what data is available in the package. Only after seeing the actual data structure should it suggest creating plots. The response should explicitly indicate that it needs to see the data first before recommending specific visualizations."
1111
},
1212
{
1313
"id": "sample_2",
1414
"description": "Check the getTableSummary tool is called in 'ask' mode. Uses python and 'chinook-sqlite.py'",
1515
"mode": "Ask",
1616
"question": "Summarize my table df.",
1717
"model_response": "",
18-
"target": "The answer should summarize the table including column names, types, and basic statistics. It should also mention the use of the `getTableSummary` tool."
18+
"target": "The answer should summarize the table including column names, types, and basic statistics. It should also have used the `getTableSummary` tool."
1919
},
2020
{
2121
"id": "sample_3",
2222
"description": "Check the `positron_editFile_internal' tool is called to edit a file. Uses python and 'chinook-sqlite.py'",
2323
"mode": "Edit",
2424
"question": "Add a method to return today's date.",
2525
"model_response": "",
26-
"target": "The answer should include python code to find the current date. It should also mention the use of the `positron_editFile_internal` tool."
26+
"target": "The answer should include python code to find the current date. It should also have used the `positron_editFile_internal` tool."
2727
}
2828
]
2929
}

test/e2e/pages/positronAssistant.ts

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ const CHAT_INPUT = '.chat-editor-container .interactive-input-editor textarea.in
3535
const SEND_MESSAGE_BUTTON = '.actions-container .action-label.codicon-send[aria-label^="Send"]';
3636
const NEW_CHAT_BUTTON = '.composite.title .actions-container[aria-label="Chat actions"] .action-item .action-label.codicon-plus[aria-label^="New Chat"]';
3737
const INLINE_CHAT_TOOLBAR = '.interactive-input-part.compact .chat-input-toolbars';
38-
const MODE_DROPDOWN = 'a.action-label[aria-label^="Set Mode"]';
38+
const MODE_DROPDOWN = 'a.action-label[aria-label^="Set Agent"]';
3939
const MODE_DROPDOWN_ITEM = '.monaco-list-row[role="menuitemcheckbox"]';
4040
const MODEL_PICKER_DROPDOWN = '.action-item.chat-modelPicker-item .monaco-dropdown .dropdown-label a.action-label[aria-label*="Pick Model"]';
4141
const MODEL_DROPDOWN_ITEM = '.monaco-list-row[role="menuitemcheckbox"]';
@@ -188,6 +188,8 @@ export class Assistant {
188188
await chatInput.waitFor({ state: 'visible' });
189189
await chatInput.fill(message);
190190
await this.code.driver.page.locator(SEND_MESSAGE_BUTTON).click();
191+
// It can take a moment for the loading locator to become visible.
192+
await this.code.driver.page.locator('.chat-most-recent-response.chat-response-loading').waitFor({ state: 'visible' });
191193
// Optionally wait for any loading state on the most recent response to finish
192194
if (waitForResponse) {
193195
await this.code.driver.page.locator('.chat-most-recent-response.chat-response-loading').waitFor({ state: 'hidden' });

test/e2e/tests/inspect-ai/inspect-ai.test.ts

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,7 @@ test.describe('Positron Assistant Inspect-ai dataset gathering', { tag: [tags.IN
122122

123123
// Start a Python Session
124124
const [pySession] = await sessions.start(['python']);
125+
const [rSession] = await sessions.start(['r']);
125126

126127
// Sign in to the assistant
127128
await app.workbench.assistant.openPositronAssistantChat();
@@ -145,20 +146,29 @@ test.describe('Positron Assistant Inspect-ai dataset gathering', { tag: [tags.IN
145146

146147
// Define setup actions in a separate object (could even be moved to its own file later)
147148
const setupActions = {
149+
'sample_1': async () => {
150+
// Start and select an R session
151+
await sessions.select(rSession.id);
152+
},
148153
'sample_2': async (app: any) => {
149154
await expect(async () => {
155+
await sessions.select(pySession.id);
150156
await app.workbench.quickaccess.openFile(join(app.workspacePathOrFolder, 'workspaces', 'chinook-db-py', 'chinook-sqlite.py'));
151157
await app.workbench.quickaccess.runCommand('python.execInConsole');
152158
}).toPass({ timeout: 5000 });
153159
},
154160
'sample_3': async (app: any) => {
155161
await expect(async () => {
162+
await sessions.select(pySession.id);
156163
await app.workbench.quickaccess.openFile(join(app.workspacePathOrFolder, 'workspaces', 'chinook-db-py', 'chinook-sqlite.py'));
157164
}).toPass({ timeout: 5000 });
158165
},
159166
} as const;
160167
// Define cleanup actions in a separate object (could even be moved to its own file later)
161168
const cleanupActions = {
169+
'sample_1': async () => {
170+
await sessions.restart(rSession.id);
171+
},
162172
'sample_2': async (app: any) => {
163173

164174
await hotKeys.closeAllEditors();
@@ -184,7 +194,6 @@ test.describe('Positron Assistant Inspect-ai dataset gathering', { tag: [tags.IN
184194
await app.workbench.assistant.clickNewChatButton();
185195
await app.workbench.assistant.selectChatMode(item.mode || 'Ask');
186196
await app.workbench.assistant.enterChatMessage(item.question);
187-
await app.workbench.assistant.waitForSendButtonVisible();
188197
const response = await app.workbench.assistant.getChatResponseText(app.workspacePathOrFolder);
189198
console.log(`Response from Assistant for ${item.id}: ${response}`);
190199
if (!response || response.trim() === '') {

0 commit comments

Comments
 (0)