Add Positron Assistant Eval (#10883)

jonvanausdeln · web-flow · commit d46ce10821cb · 2025-12-02T15:01:06.000-08:00
This fixes some inspect-ai test issues and adds a new eval for a hallucination - Update model selector from the 1.106 merge - Fixed issues where chat message was too quick and didn't wait for response to finish - Change `sample_1` to be a relevant eval checking for previously seen hallucination ### QA Notes @:assistant See eval run at https://github.com/posit-dev/positron/actions/runs/19872015490 Note - Failures are because it failed the eval, which happens sometimes and is the main point of this test.
diff --git a/.github/workflows/test-assistant-llm.yml b/.github/workflows/test-assistant-llm.yml
@@ -100,7 +100,7 @@ jobs:  # to input into inspect-ai-test job below
           python test/assistant-inspect-ai/inspect_result_parser.py "$LATEST_LOG" --threshold 0.8
 
       - name: Upload inspect-ai eval logs
-        if: success()
+        if: always()
         uses: actions/upload-artifact@v4
         with:
           name: inspect-ai-logs
diff --git a/test/assistant-inspect-ai/json-response-eval.py b/test/assistant-inspect-ai/json-response-eval.py
@@ -82,7 +82,7 @@ def json_model_graded_eval():
 		data = json.load(f)
 
 	test_data = data.get("tests", [])
-	model_name = "anthropic/claude-3-7-sonnet-20250219"
+	model_name = "anthropic/claude-haiku-4-5-20251001"
 
 	# Create samples from the loaded JSON data
 	samples = [record_to_sample(record) for record in test_data]
diff --git a/test/assistant-inspect-ai/response-dataset.json b/test/assistant-inspect-ai/response-dataset.json
@@ -3,27 +3,27 @@
 	"tests": [
 		{
 			"id": "sample_1",
-			"description": "Check context include Positron version and interpreter version.",
+			"description": "EVAL: Check for a hallucination when the LLM makes up column data.",
 			"mode": "Ask",
-			"question": "What version is this?",
+			"question": "Load the forested package and make some plots of the forested data",
 			"model_response": "",
-			"target": "The answer should return at least the version of Positron being used and either a Python or R version."
+			"target": "The response should load the forested package and then generate code to explore the data structure before creating any visualizations. The assistant must NOT hallucinate or assume column names, data types, or dataset structure. It should first run (or ask the user to run) exploratory code such as data(), head(), str(), or similar functions to examine what data is available in the package. Only after seeing the actual data structure should it suggest creating plots. The response should explicitly indicate that it needs to see the data first before recommending specific visualizations."
 		},
 		{
 			"id": "sample_2",
 			"description": "Check the getTableSummary tool is called in 'ask' mode. Uses python and 'chinook-sqlite.py'",
 			"mode": "Ask",
 			"question": "Summarize my table df.",
 			"model_response": "",
-			"target": "The answer should summarize the table including column names, types, and basic statistics. It should also mention the use of the `getTableSummary` tool."
+			"target": "The answer should summarize the table including column names, types, and basic statistics. It should also have used the `getTableSummary` tool."
 		},
 		{
 			"id": "sample_3",
 			"description": "Check the `positron_editFile_internal' tool is called to edit a file. Uses python and 'chinook-sqlite.py'",
 			"mode": "Edit",
 			"question": "Add a method to return today's date.",
 			"model_response": "",
-			"target": "The answer should include python code to find the current date. It should also mention the use of the `positron_editFile_internal` tool."
+			"target": "The answer should include python code to find the current date. It should also have used the `positron_editFile_internal` tool."
 		}
 	]
 }
diff --git a/test/e2e/pages/positronAssistant.ts b/test/e2e/pages/positronAssistant.ts
@@ -35,7 +35,7 @@ const CHAT_INPUT = '.chat-editor-container .interactive-input-editor textarea.in
 const SEND_MESSAGE_BUTTON = '.actions-container .action-label.codicon-send[aria-label^="Send"]';
 const NEW_CHAT_BUTTON = '.composite.title .actions-container[aria-label="Chat actions"] .action-item .action-label.codicon-plus[aria-label^="New Chat"]';
 const INLINE_CHAT_TOOLBAR = '.interactive-input-part.compact .chat-input-toolbars';
-const MODE_DROPDOWN = 'a.action-label[aria-label^="Set Mode"]';
+const MODE_DROPDOWN = 'a.action-label[aria-label^="Set Agent"]';
 const MODE_DROPDOWN_ITEM = '.monaco-list-row[role="menuitemcheckbox"]';
 const MODEL_PICKER_DROPDOWN = '.action-item.chat-modelPicker-item .monaco-dropdown .dropdown-label a.action-label[aria-label*="Pick Model"]';
 const MODEL_DROPDOWN_ITEM = '.monaco-list-row[role="menuitemcheckbox"]';
@@ -188,6 +188,8 @@ export class Assistant {
 		await chatInput.waitFor({ state: 'visible' });
 		await chatInput.fill(message);
 		await this.code.driver.page.locator(SEND_MESSAGE_BUTTON).click();
+		// It can take a moment for the loading locator to become visible.
+		await this.code.driver.page.locator('.chat-most-recent-response.chat-response-loading').waitFor({ state: 'visible' });
 		// Optionally wait for any loading state on the most recent response to finish
 		if (waitForResponse) {
 			await this.code.driver.page.locator('.chat-most-recent-response.chat-response-loading').waitFor({ state: 'hidden' });
diff --git a/test/e2e/tests/inspect-ai/inspect-ai.test.ts b/test/e2e/tests/inspect-ai/inspect-ai.test.ts
@@ -122,6 +122,7 @@ test.describe('Positron Assistant Inspect-ai dataset gathering', { tag: [tags.IN
 
 		// Start a Python Session
 		const [pySession] = await sessions.start(['python']);
+		const [rSession] = await sessions.start(['r']);
 
 		// Sign in to the assistant
 		await app.workbench.assistant.openPositronAssistantChat();
@@ -145,20 +146,29 @@ test.describe('Positron Assistant Inspect-ai dataset gathering', { tag: [tags.IN
 
 		// Define setup actions in a separate object (could even be moved to its own file later)
 		const setupActions = {
+			'sample_1': async () => {
+				// Start and select an R session
+				await sessions.select(rSession.id);
+			},
 			'sample_2': async (app: any) => {
 				await expect(async () => {
+					await sessions.select(pySession.id);
 					await app.workbench.quickaccess.openFile(join(app.workspacePathOrFolder, 'workspaces', 'chinook-db-py', 'chinook-sqlite.py'));
 					await app.workbench.quickaccess.runCommand('python.execInConsole');
 				}).toPass({ timeout: 5000 });
 			},
 			'sample_3': async (app: any) => {
 				await expect(async () => {
+					await sessions.select(pySession.id);
 					await app.workbench.quickaccess.openFile(join(app.workspacePathOrFolder, 'workspaces', 'chinook-db-py', 'chinook-sqlite.py'));
 				}).toPass({ timeout: 5000 });
 			},
 		} as const;
 		// Define cleanup actions in a separate object (could even be moved to its own file later)
 		const cleanupActions = {
+			'sample_1': async () => {
+				await sessions.restart(rSession.id);
+			},
 			'sample_2': async (app: any) => {
 
 				await hotKeys.closeAllEditors();
@@ -184,7 +194,6 @@ test.describe('Positron Assistant Inspect-ai dataset gathering', { tag: [tags.IN
 			await app.workbench.assistant.clickNewChatButton();
 			await app.workbench.assistant.selectChatMode(item.mode || 'Ask');
 			await app.workbench.assistant.enterChatMessage(item.question);
-			await app.workbench.assistant.waitForSendButtonVisible();
 			const response = await app.workbench.assistant.getChatResponseText(app.workspacePathOrFolder);
 			console.log(`Response from Assistant for ${item.id}: ${response}`);
 			if (!response || response.trim() === '') {

Original file line number	Diff line number	Diff line change
`@@ -3,27 +3,27 @@`
`3`	`3`	`"tests": [`
`4`	`4`	`{`
`5`	`5`	`"id": "sample_1",`
`6`		`- "description": "Check context include Positron version and interpreter version.",`
	`6`	`+ "description": "EVAL: Check for a hallucination when the LLM makes up column data.",`
`7`	`7`	`"mode": "Ask",`
`8`		`- "question": "What version is this?",`
	`8`	`+ "question": "Load the forested package and make some plots of the forested data",`
`9`	`9`	`"model_response": "",`
`10`		`- "target": "The answer should return at least the version of Positron being used and either a Python or R version."`
	`10`	+ "target": "The response should load the forested package and then generate code to explore the data structure before creating any visualizations. The assistant must NOT hallucinate or assume column names, data types, or dataset structure. It should first run (or ask the user to run) exploratory code such as data(), head(), str(), or similar functions to examine what data is available in the package. Only after seeing the actual data structure should it suggest creating plots. The response should explicitly indicate that it needs to see the data first before recommending specific visualizations."
`11`	`11`	`},`
`12`	`12`	`{`
`13`	`13`	`"id": "sample_2",`
`14`	`14`	`"description": "Check the getTableSummary tool is called in 'ask' mode. Uses python and 'chinook-sqlite.py'",`
`15`	`15`	`"mode": "Ask",`
`16`	`16`	`"question": "Summarize my table df.",`
`17`	`17`	`"model_response": "",`
`18`		- "target": "The answer should summarize the table including column names, types, and basic statistics. It should also mention the use of the `getTableSummary` tool."
	`18`	+ "target": "The answer should summarize the table including column names, types, and basic statistics. It should also have used the `getTableSummary` tool."
`19`	`19`	`},`
`20`	`20`	`{`
`21`	`21`	`"id": "sample_3",`
`22`	`22`	"description": "Check the `positron_editFile_internal' tool is called to edit a file. Uses python and 'chinook-sqlite.py'",
`23`	`23`	`"mode": "Edit",`
`24`	`24`	`"question": "Add a method to return today's date.",`
`25`	`25`	`"model_response": "",`
`26`		- "target": "The answer should include python code to find the current date. It should also mention the use of the `positron_editFile_internal` tool."
	`26`	+ "target": "The answer should include python code to find the current date. It should also have used the `positron_editFile_internal` tool."
`27`	`27`	`}`
`28`	`28`	`]`
`29`	`29`	`}`