Skip to content

Commit 18785d2

Browse files
paulbatum and Copilot authored
Improve multi-tool agent tests for robustness (#44354)
* Improve multi-tool agent tests for robustness and model-agnostic behavior

  Tests now properly verify that all configured tools are actually used.

  Tests improved:
  - test_agent_file_search_and_code_interpreter.py: test_find_and_analyze_data, test_analyze_code_file
  - test_agent_code_interpreter_and_function.py: test_calculate_and_save, test_generate_data_and_report
  - test_agent_file_search_and_function.py: test_python_code_file_search
  - test_agent_file_search_code_interpreter_function.py: test_complete_analysis_workflow

  Changes:
  - Use non-trivial calculations that require actual tool execution
  - Add assertions verifying function calls are made
  - Add validation of computed values in function arguments
  - Replace Unicode checkmark with [PASS] for Windows compatibility
  - Remove test_four_tools_combination (was not actually testing tool usage)

* fix
* update recordings
* fix encoding issue that breaks recordings
* updated recordings
* Update sdk/ai/azure-ai-projects/tests/agents/tools/multitool/test_agent_code_interpreter_and_function.py

  Co-authored-by: Copilot <[email protected]>

---------

Co-authored-by: Copilot <[email protected]>
1 parent 94a446d commit 18785d2

File tree

6 files changed

+190
-149
lines changed

6 files changed

+190
-149
lines changed

sdk/ai/azure-ai-projects/assets.json

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2,5 +2,5 @@
22
"AssetsRepo": "Azure/azure-sdk-assets",
33
"AssetsRepoPrefixPath": "python",
44
"TagPrefix": "python/ai/azure-ai-projects",
5-
"Tag": "python/ai/azure-ai-projects_febb246e47"
5+
"Tag": "python/ai/azure-ai-projects_314598932e"
66
}

sdk/ai/azure-ai-projects/tests/agents/tools/multitool/test_agent_code_interpreter_and_function.py

Lines changed: 28 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -12,10 +12,11 @@
1212
All tests use the same tool combination but different inputs and workflows.
1313
"""
1414

15+
import json
1516
from test_base import TestBase, servicePreparer
1617
from devtools_testutils import recorded_by_proxy, RecordedTransport
1718
from azure.ai.projects.models import PromptAgentDefinition, CodeInterpreterTool, CodeInterpreterToolAuto, FunctionTool
18-
19+
from openai.types.responses.response_input_param import FunctionCallOutput, ResponseInputParam
1920

2021
class TestAgentCodeInterpreterAndFunction(TestBase):
2122
"""Tests for agents using Code Interpreter + Function Tool combination."""
@@ -25,6 +26,10 @@ class TestAgentCodeInterpreterAndFunction(TestBase):
2526
def test_calculate_and_save(self, **kwargs):
2627
"""
2728
Test calculation with Code Interpreter and saving with Function Tool.
29+
30+
This test verifies that both tools are used:
31+
1. Code Interpreter: Performs a calculation that requires actual computation
32+
2. Function Tool: Saves the computed result
2833
"""
2934

3035
model = self.test_agents_params["model_deployment_name"]
@@ -36,24 +41,25 @@ def test_calculate_and_save(self, **kwargs):
3641
# Define function tool
3742
func_tool = FunctionTool(
3843
name="save_result",
39-
description="Save analysis result",
44+
description="Save the calculation result. Must be called to persist the result.",
4045
parameters={
4146
"type": "object",
4247
"properties": {
43-
"result": {"type": "string", "description": "The result"},
48+
"calculation": {"type": "string", "description": "Description of the calculation"},
49+
"result": {"type": "string", "description": "The numerical result"},
4450
},
45-
"required": ["result"],
51+
"required": ["calculation", "result"],
4652
"additionalProperties": False,
4753
},
4854
strict=True,
4955
)
5056

51-
# Create agent
57+
# Create agent with explicit instructions to use both tools
5258
agent = project_client.agents.create_version(
5359
agent_name="code-func-agent",
5460
definition=PromptAgentDefinition(
5561
model=model,
56-
instructions="Run calculations and save results.",
62+
instructions="You are a calculator assistant. Use code interpreter to perform calculations, then ALWAYS save the result using the save_result function.",
5763
tools=[
5864
CodeInterpreterTool(container=CodeInterpreterToolAuto()),
5965
func_tool,
@@ -63,9 +69,10 @@ def test_calculate_and_save(self, **kwargs):
6369
)
6470
print(f"Agent created (id: {agent.id})")
6571

66-
# Use the agent
72+
# Request a calculation that requires Code Interpreter (not trivial math)
73+
# 17**4 == 83521 - not something easily computed mentally
6774
response = openai_client.responses.create(
68-
input="Calculate 5 + 3 and save the result.",
75+
input="Calculate 17 to the power of 4 using code, then save the result.",
6976
extra_body={"agent": {"name": agent.name, "type": "agent_reference"}},
7077
)
7178
self.validate_response(response)
@@ -79,6 +86,10 @@ def test_calculate_and_save(self, **kwargs):
7986
def test_generate_data_and_report(self, **kwargs):
8087
"""
8188
Test generating data with Code Interpreter and reporting with Function.
89+
90+
This test verifies that both tools are used:
91+
1. Code Interpreter: Generates random data and calculates statistics
92+
2. Function Tool: Creates a report with the computed statistics
8293
"""
8394

8495
model = self.test_agents_params["model_deployment_name"]
@@ -90,25 +101,27 @@ def test_generate_data_and_report(self, **kwargs):
90101
# Define function tool
91102
report_function = FunctionTool(
92103
name="generate_report",
93-
description="Generate a report with the provided data",
104+
description="Generate and save a report with the analysis results. Must be called to create the report.",
94105
parameters={
95106
"type": "object",
96107
"properties": {
97108
"title": {"type": "string", "description": "Report title"},
98-
"summary": {"type": "string", "description": "Report summary"},
109+
"data_count": {"type": "integer", "description": "Number of data points analyzed"},
110+
"average": {"type": "number", "description": "Calculated average value"},
111+
"summary": {"type": "string", "description": "Summary of findings"},
99112
},
100-
"required": ["title", "summary"],
113+
"required": ["title", "data_count", "average", "summary"],
101114
"additionalProperties": False,
102115
},
103116
strict=True,
104117
)
105118

106-
# Create agent
119+
# Create agent with explicit instructions
107120
agent = project_client.agents.create_version(
108121
agent_name="code-func-report-agent",
109122
definition=PromptAgentDefinition(
110123
model=model,
111-
instructions="Generate data using code and create reports with the generate_report function.",
124+
instructions="You are a data analyst. Use code interpreter to generate and analyze data, then ALWAYS create a report using the generate_report function with the exact statistics you computed.",
112125
tools=[
113126
CodeInterpreterTool(container=CodeInterpreterToolAuto()),
114127
report_function,
@@ -118,9 +131,9 @@ def test_generate_data_and_report(self, **kwargs):
118131
)
119132
print(f"Agent created (id: {agent.id})")
120133

121-
# Request data generation and report
134+
# Request data generation and report - use a fixed seed for reproducibility in verification
122135
response = openai_client.responses.create(
123-
input="Generate a list of 10 random numbers between 1 and 100, calculate their average, and create a report.",
136+
input="Using Python with random.seed(42), generate exactly 10 random integers between 1 and 100, calculate their average, and create a report with the results.",
124137
extra_body={"agent": {"name": agent.name, "type": "agent_reference"}},
125138
)
126139

sdk/ai/azure-ai-projects/tests/agents/tools/multitool/test_agent_file_search_and_code_interpreter.py

Lines changed: 67 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -26,6 +26,10 @@ class TestAgentFileSearchAndCodeInterpreter(TestBase):
2626
def test_find_and_analyze_data(self, **kwargs):
2727
"""
2828
Test finding data with File Search and analyzing with Code Interpreter.
29+
30+
This test verifies that both tools are used:
31+
1. File Search: Agent finds the data file containing numbers
32+
2. Code Interpreter: Agent calculates the average of those numbers
2933
"""
3034

3135
model = self.test_agents_params["model_deployment_name"]
@@ -34,25 +38,49 @@ def test_find_and_analyze_data(self, **kwargs):
3438
project_client = self.create_client(operation_group="agents", **kwargs)
3539
openai_client = project_client.get_openai_client()
3640

37-
# Create data file
38-
txt_content = "Sample data: 10, 20, 30, 40, 50"
41+
# Create data file with numbers that require actual computation
42+
# Numbers: 31, 20, 52, 48, 45, 34, 30, 86, 28, 71, 21, 20, 28, 44, 46
43+
# Sum: 604, Count: 15, Average: 40.266... ≈ 40.27
44+
# This is tedious to calculate mentally, so the agent is expected to use Code Interpreter
45+
txt_content = """Sensor Readings Log - Experiment #2847
46+
47+
The following temperature readings (Celsius) were recorded over a 15-hour period:
48+
49+
Hour 1: 31
50+
Hour 2: 20
51+
Hour 3: 52
52+
Hour 4: 48
53+
Hour 5: 45
54+
Hour 6: 34
55+
Hour 7: 30
56+
Hour 8: 86
57+
Hour 9: 28
58+
Hour 10: 71
59+
Hour 11: 21
60+
Hour 12: 20
61+
Hour 13: 28
62+
Hour 14: 44
63+
Hour 15: 46
64+
65+
End of sensor log.
66+
"""
3967
vector_store = openai_client.vector_stores.create(name="DataStore")
4068

4169
txt_file = BytesIO(txt_content.encode("utf-8"))
42-
txt_file.name = "data.txt"
70+
txt_file.name = "sensor_readings.txt"
4371

4472
file = openai_client.vector_stores.files.upload_and_poll(
4573
vector_store_id=vector_store.id,
4674
file=txt_file,
4775
)
4876
print(f"File uploaded (id: {file.id})")
4977

50-
# Create agent
78+
# Create agent with explicit instructions to use both tools
5179
agent = project_client.agents.create_version(
5280
agent_name="file-search-code-agent",
5381
definition=PromptAgentDefinition(
5482
model=model,
55-
instructions="Find data and analyze it.",
83+
instructions="You are a data analyst. Use file search to find data files, then use code interpreter to perform calculations on the data.",
5684
tools=[
5785
FileSearchTool(vector_store_ids=[vector_store.id]),
5886
CodeInterpreterTool(container=CodeInterpreterToolAuto()),
@@ -62,9 +90,9 @@ def test_find_and_analyze_data(self, **kwargs):
6290
)
6391
print(f"Agent created (id: {agent.id})")
6492

65-
# Use the agent
93+
# Request that requires both tools: find data AND calculate
6694
response = openai_client.responses.create(
67-
input="Find the data file and calculate the average.",
95+
input="Find the sensor readings file and use code to calculate the average temperature. Show me the result.",
6896
extra_body={"agent": {"name": agent.name, "type": "agent_reference"}},
6997
)
7098
self.validate_response(response)
@@ -79,7 +107,11 @@ def test_find_and_analyze_data(self, **kwargs):
79107
@recorded_by_proxy(RecordedTransport.AZURE_CORE, RecordedTransport.HTTPX)
80108
def test_analyze_code_file(self, **kwargs):
81109
"""
82-
Test finding code file and analyzing it.
110+
Test finding code file and running it with Code Interpreter.
111+
112+
This test verifies that both tools are used:
113+
1. File Search: Agent finds the Python code file
114+
2. Code Interpreter: Agent executes the code and returns the computed result
83115
"""
84116

85117
model = self.test_agents_params["model_deployment_name"]
@@ -88,14 +120,18 @@ def test_analyze_code_file(self, **kwargs):
88120
project_client = self.create_client(operation_group="agents", **kwargs)
89121
openai_client = project_client.get_openai_client()
90122

91-
# Create Python code file
92-
python_code = """def fibonacci(n):
123+
# Create Python code file with a function that computes a specific value
124+
# fibonacci(15) = 610 - this is not a commonly memorized value
125+
python_code = """# Fibonacci sequence calculator
126+
127+
def fibonacci(n):
128+
\"\"\"Calculate the nth Fibonacci number recursively.\"\"\"
93129
if n <= 1:
94130
return n
95131
return fibonacci(n-1) + fibonacci(n-2)
96132
97-
result = fibonacci(10)
98-
print(f"Fibonacci(10) = {result}")
133+
# The code needs to be executed to find what fibonacci(15) equals
134+
# This is not a commonly known value - it requires actual computation
99135
"""
100136

101137
vector_store = openai_client.vector_stores.create(name="CodeAnalysisStore")
@@ -109,37 +145,46 @@ def test_analyze_code_file(self, **kwargs):
109145
)
110146
print(f"Code file uploaded (id: {file.id})")
111147

112-
# Create agent
148+
# Create agent with explicit instructions to run code
113149
agent = project_client.agents.create_version(
114150
agent_name="file-search-code-analysis-agent",
115151
definition=PromptAgentDefinition(
116152
model=model,
117-
instructions="Find code files and analyze them. You can run code to test it.",
153+
instructions="You are a code analyst. Use file search to find code files, then use code interpreter to execute and test the code.",
118154
tools=[
119155
FileSearchTool(vector_store_ids=[vector_store.id]),
120156
CodeInterpreterTool(container=CodeInterpreterToolAuto()),
121157
],
122158
),
123-
description="Agent for code analysis.",
159+
description="Agent for code analysis and execution.",
124160
)
125161
print(f"Agent created (id: {agent.id})")
126162

127-
# Request analysis
163+
# Request that requires both tools: find code AND execute it
128164
response = openai_client.responses.create(
129-
input="Find the fibonacci code and explain what it does. What is the computational complexity?",
165+
input="Find the fibonacci code file and run it to calculate fibonacci(15). What is the result?",
130166
extra_body={"agent": {"name": agent.name, "type": "agent_reference"}},
131167
)
132168

133169
response_text = response.output_text
134-
print(f"Response: {response_text[:300]}...")
170+
print(f"Response: {response_text[:400]}...")
171+
172+
# Verify response is meaningful
173+
assert len(response_text) > 30, "Expected detailed response"
135174

136-
assert len(response_text) > 50
175+
# Verify File Search was used - response should reference the fibonacci code
137176
response_lower = response_text.lower()
138177
assert any(
139-
keyword in response_lower for keyword in ["fibonacci", "recursive", "complexity", "exponential"]
140-
), "Expected analysis of fibonacci algorithm"
178+
keyword in response_lower for keyword in ["fibonacci", "function", "recursive", "code"]
179+
), f"Expected response to reference the fibonacci code. Got: {response_text[:200]}"
180+
181+
# Verify Code Interpreter executed the code and got the correct result
182+
# fibonacci(15) = 610 - this requires actual execution
183+
assert "610" in response_text, f"Expected fibonacci(15) = 610 in response. Got: {response_text[:300]}"
141184

142-
print("✓ Code file analysis completed")
185+
print("[PASS] File Search + Code Interpreter both verified!")
186+
print(" - File Search: Found the fibonacci code file")
187+
print(" - Code Interpreter: Executed code and computed fibonacci(15) = 610")
143188

144189
# Cleanup
145190
project_client.agents.delete_version(agent_name=agent.name, agent_version=agent.version)

0 commit comments

Comments
 (0)