From 8f8155b4a566727b2e154c19a2483c9e209bfaa8 Mon Sep 17 00:00:00 2001
From: Arthur Lui <alui@lanl.gov>
Date: Thu, 4 Dec 2025 16:56:03 -0700
Subject: [PATCH 01/36] add multiagent

---
 .gitignore                                 |   3 +-
 justfile                                   |   3 +
 src/ursa/experimental/agents/multiagent.py | 128 +++++++++++++++++++++
 3 files changed, 133 insertions(+), 1 deletion(-)
 create mode 100644 src/ursa/experimental/agents/multiagent.py

diff --git a/.gitignore b/.gitignore
index 696f49eb..c43b5c40 100644
--- a/.gitignore
+++ b/.gitignore
@@ -31,4 +31,5 @@ arxiv_papers
 *.sqfs
 ursa_workspace/
 .vscode/settings.json
-scratch/
\ No newline at end of file
+scratch/
+ursa-workspace/
diff --git a/justfile b/justfile
index ead5fb84..5a7a418e 100644
--- a/justfile
+++ b/justfile
@@ -107,3 +107,6 @@ shell:
 pygrep pattern:
     conda run --live-stream -n base watch \
         grep --exclude-dir=__pycache__ --exclude-dir=.venv -r '{{ pattern }}'
+
+python:
+    uv run ipython --no-autoindent
diff --git a/src/ursa/experimental/agents/multiagent.py b/src/ursa/experimental/agents/multiagent.py
new file mode 100644
index 00000000..fa521dda
--- /dev/null
+++ b/src/ursa/experimental/agents/multiagent.py
@@ -0,0 +1,128 @@
+from pathlib import Path
+from typing import Optional
+
+from langchain.agents import create_agent
+from langchain.chat_models import BaseChatModel
+from langchain.messages import HumanMessage
+from langchain.tools import tool
+from langgraph.checkpoint.base import BaseCheckpointSaver
+
+from ursa.agents import ExecutionAgent, PlanningAgent
+from ursa.util import Checkpointer
+
+system_prompt = """\
+You are an agent with multiple subagents and tools.
+
+These agents are available to you:
+
+* execution_agent
+  * Use this agent whenever you are asked to write/edit code or run arbitrary
+    commands from the command line.
+
+* planning_agent
+  * Use this agent whenever you are asked to plan out tasks.
+
+Note that if the user asks you to plan and then execute a task, you are 
+to iterate through each part (step or bullet point) of a task and then
+carry out the execution agent. Here is an example query:
+
+Please make a plan to print the first 10 natural numbers in python, then execute
+the code.
+
+For this query, a generated plan might look like this:
+
+```
+The user wants to compute the first 10 natural numbers in python. This is the plan.
+
+* step 1: write code
+* step 2: check that code is correct
+```
+
+In this case you should call the execution agent for step 1; and then call the
+execution agent for step 2. If more steps are in the plan, keep calling the
+execution agent.
+"""
+
+
+# NOTE: Is the solution to have a tool that breaks up the string plan, and then
+# execute each section of the plan?
+@tool
+def execute_plan(plan: str):
+    """Execute plan item by item."""
+    ...
+
+
+class Ursa:
+    def __init__(
+        self,
+        llm: BaseChatModel,
+        extra_tools: list = [],
+        workspace: Path = Path("ursa-workspace"),
+        checkpointer: Optional[BaseCheckpointSaver] = None,
+        thread_id: str = "ursa",
+        max_reflection_steps: int = 1,
+        system_prompt: str = system_prompt,
+    ):
+        self.llm = llm
+        self.extra_tools = extra_tools
+        self.workspace = workspace
+        self.checkpointer = checkpointer
+        self.thread_id = thread_id
+        self.system_prompt = system_prompt
+        self.max_reflection_steps = max_reflection_steps
+        self.checkpointer = checkpointer or Checkpointer.from_workspace(
+            workspace
+        )
+
+    def make_planning_tool(self):
+        planning_agent = PlanningAgent(
+            self.llm,
+            max_reflection_steps=self.max_reflection_steps,
+            thread_id=self.thread_id,
+        )
+
+        @tool(
+            "planning_agent",
+            description="Create plans for arbitrary tasks",
+        )
+        def call_agent(query: str):
+            result = planning_agent.invoke({
+                "messages": [HumanMessage(query)],
+            })
+            return result["messages"][-1].content
+
+        return call_agent
+
+    def make_execution_tool(self):
+        execution_agent = ExecutionAgent(self.llm, thread_id=self.thread_id)
+
+        @tool(
+            "execution_agent",
+            description="Read and edit scripts/code, and execute arbitrary commands on command line.",
+        )
+        def call_agent(query: str):
+            result = execution_agent.invoke({
+                "messages": [HumanMessage(query)],
+                "workspace": str(self.workspace),
+            })
+            return result["messages"][-1].content
+
+        return call_agent
+
+    def create(self, **kwargs):
+        """Create agent.
+
+        kwargs: for `create_agent`
+        """
+        self.subagents = [
+            self.make_execution_tool(),
+            self.make_planning_tool(),
+        ]
+        self.tools = self.subagents + self.extra_tools
+        return create_agent(
+            self.llm,
+            tools=self.tools,
+            system_prompt=self.system_prompt,
+            checkpointer=self.checkpointer,
+            **kwargs,
+        )

From f3856dfe12a8927c79b0a613adbb117e88190e12 Mon Sep 17 00:00:00 2001
From: Arthur Lui <alui@lanl.gov>
Date: Fri, 5 Dec 2025 16:49:50 -0700
Subject: [PATCH 02/36] plan_execute_tool

---
 dev/.gitignore                             |   1 +
 dev/test.py                                |  87 +++++++++++
 justfile                                   |   1 +
 src/ursa/agents/execution_agent.py         |   3 +-
 src/ursa/experimental/agents/deep.py       |  42 ++++++
 src/ursa/experimental/agents/multiagent.py | 165 ++++++++++++---------
 6 files changed, 232 insertions(+), 67 deletions(-)
 create mode 100644 dev/.gitignore
 create mode 100644 dev/test.py
 create mode 100644 src/ursa/experimental/agents/deep.py

diff --git a/dev/.gitignore b/dev/.gitignore
new file mode 100644
index 00000000..0e51ad35
--- /dev/null
+++ b/dev/.gitignore
@@ -0,0 +1 @@
+ursa-workspace/
diff --git a/dev/test.py b/dev/test.py
new file mode 100644
index 00000000..008bb531
--- /dev/null
+++ b/dev/test.py
@@ -0,0 +1,87 @@
+import os
+
+import httpx
+from langchain.chat_models import init_chat_model
+from langchain.messages import HumanMessage
+from langchain_openai import ChatOpenAI
+from langgraph.checkpoint.memory import InMemorySaver
+from pydantic import SecretStr
+
+from ursa.experimental.agents.multiagent import Ursa
+
+aiportal = False
+
+if aiportal:
+    llm = ChatOpenAI(
+        model=os.environ["CLAUDE"],
+        # model="gpt-oss-120b",
+        base_url=os.environ["AIPORTAL_API_URL"],
+        api_key=SecretStr(os.environ["AIPORTAL_API_KEY"]),
+        http_client=httpx.Client(verify=False),
+    )
+else:
+    # llm = init_chat_model("ollama:ministral-3:14b")
+    llm = init_chat_model("openai:gpt-5-nano")
+
+
+agent = Ursa(
+    llm,
+    max_reflection_steps=0,
+    checkpointer=InMemorySaver(),
+).create()
+
+results = []
+
+
+def run(query: str):
+    print(f"Task:\n{query}")
+    results.append(
+        result := agent.invoke(
+            {"messages": [HumanMessage(query)]},
+            {
+                "configurable": {
+                    "thread_id": "ursa",
+                },
+                "recursion_limit": 50,
+            },
+        )
+    )
+    return result
+
+
+# run(
+#     "Write and execute a very minimal python script to compute Pi using Monte Carlo."
+# )
+
+# run("What did you just do?")
+
+# print(results)
+
+
+# run(
+#     "Write a plan to write a very minimal python script to compute Pi using Monte Carlo."
+#     "After planning, please execute the plan step by step. Save any code to disk."
+# )
+
+# run("Can you now execute the plan?")
+
+query = """
+I have a file `data/data.csv`. 
+
+**First**, read the first few lines of the file to understand the format.
+Do this quickly; don't go overboard.
+
+**Then**, write a plan (with at most 3 steps) to perform simple linear
+regression on this data in python. The linear regression must have a slope and
+intercept. The plan MUST NOT include code; though it may include instruction
+to write code.  The analysis should be **very minimal** and AS CONCISE AS
+POSSIBLE.
+
+**Finally**, EXECUTE THE PLAN using execute_plan_tool. Write any code to
+*`output/`. DO NOT write anything to `data/`.
+"""
+run(query)
+
+for result in results:
+    for msg in result["messages"]:
+        msg.pretty_print()
diff --git a/justfile b/justfile
index 5a7a418e..0740f98c 100644
--- a/justfile
+++ b/justfile
@@ -108,5 +108,6 @@ pygrep pattern:
     conda run --live-stream -n base watch \
         grep --exclude-dir=__pycache__ --exclude-dir=.venv -r '{{ pattern }}'
 
+[no-cd]
 python:
     uv run ipython --no-autoindent
diff --git a/src/ursa/agents/execution_agent.py b/src/ursa/agents/execution_agent.py
index 6260c5f3..e46509f2 100644
--- a/src/ursa/agents/execution_agent.py
+++ b/src/ursa/agents/execution_agent.py
@@ -48,6 +48,7 @@
     ToolMessage,
 )
 from langchain_core.tools import StructuredTool
+from langchain_core.tools.base import BaseTool
 from langchain_mcp_adapters.client import MultiServerMCPClient
 from langgraph.graph import StateGraph
 from langgraph.graph.message import add_messages
@@ -221,7 +222,7 @@ def __init__(
         llm: BaseChatModel,
         agent_memory: Optional[Any | AgentMemory] = None,
         log_state: bool = False,
-        extra_tools: Optional[list[Callable[..., Any]]] = None,
+        extra_tools: Optional[list[BaseTool]] = None,
         tokens_before_summarize: int = 50000,
         messages_to_keep: int = 20,
         safe_codes: Optional[list[str]] = None,
diff --git a/src/ursa/experimental/agents/deep.py b/src/ursa/experimental/agents/deep.py
new file mode 100644
index 00000000..372799cd
--- /dev/null
+++ b/src/ursa/experimental/agents/deep.py
@@ -0,0 +1,42 @@
+from deepagents import CompiledSubAgent, create_deep_agent
+from langchain.chat_models import init_chat_model
+from langchain.messages import HumanMessage
+
+from ursa.agents import ExecutionAgent
+
+llm = init_chat_model("openai:gpt-5-nano")
+exe_graph = ExecutionAgent(llm=llm)._action
+
+# # Create a custom agent graph
+# custom_graph = create_agent(
+#     model=exe_graph,
+#     # tools=specialized_tools,
+#     system_prompt="You are a specialized agent for data analysis...",
+# )
+
+# Use it as a custom subagent
+exe_subagent = CompiledSubAgent(
+    name="executor",
+    description="Specialized agent for writing/executing code",
+    runnable=exe_graph,
+)
+
+subagents = [exe_subagent]
+
+agent = create_deep_agent(
+    model=llm,
+    # tools=[internet_search],
+    system_prompt="You are a data scientist. When asked to write code, use the executor agent.",
+    subagents=[exe_subagent],
+)
+
+
+results = []
+
+
+def run(query: str):
+    results.append(result := agent.invoke({"messages": [HumanMessage(query)]}))
+    return result
+
+
+run("Write a very minimal python script to compute Pi using Monte Carlo.")
diff --git a/src/ursa/experimental/agents/multiagent.py b/src/ursa/experimental/agents/multiagent.py
index fa521dda..03f5b55b 100644
--- a/src/ursa/experimental/agents/multiagent.py
+++ b/src/ursa/experimental/agents/multiagent.py
@@ -1,6 +1,7 @@
 from pathlib import Path
 from typing import Optional
 
+import yaml
 from langchain.agents import create_agent
 from langchain.chat_models import BaseChatModel
 from langchain.messages import HumanMessage
@@ -11,45 +12,109 @@
 from ursa.util import Checkpointer
 
 system_prompt = """\
-You are an agent with multiple subagents and tools.
+You are an data scientist with multiple tools.
 
-These agents are available to you:
-
-* execution_agent
-  * Use this agent whenever you are asked to write/edit code or run arbitrary
-    commands from the command line.
+These tools are available to you:
 
 * planning_agent
-  * Use this agent whenever you are asked to plan out tasks.
-
-Note that if the user asks you to plan and then execute a task, you are 
-to iterate through each part (step or bullet point) of a task and then
-carry out the execution agent. Here is an example query:
-
-Please make a plan to print the first 10 natural numbers in python, then execute
-the code.
+  * Use this tool whenever you are asked to plan out tasks.
+  * In each step of your plan, if code needs to be generated, please 
+    explicitly state in the step that code needs to be written and executed.
 
-For this query, a generated plan might look like this:
-
-```
-The user wants to compute the first 10 natural numbers in python. This is the plan.
-
-* step 1: write code
-* step 2: check that code is correct
-```
+* execution_agent
+  * Use this tool **whenever** you are asked to write/edit code or run arbitrary
+    commands from the command line.
 
-In this case you should call the execution agent for step 1; and then call the
-execution agent for step 2. If more steps are in the plan, keep calling the
-execution agent.
+* execute_plan_tool
+  * Use this tool if you are asked to execute a plan that starts with <PLAN> and ends with </PLAN>.
+  * Do not use this tool if the <PLAN></PLAN> tags are not present in the instruction!
 """
 
 
 # NOTE: Is the solution to have a tool that breaks up the string plan, and then
 # execute each section of the plan?
-@tool
-def execute_plan(plan: str):
-    """Execute plan item by item."""
-    ...
+def make_execute_plan_tool(llm: BaseChatModel, workspace: Path):
+    execution_agent = ExecutionAgent(llm)._action
+
+    @tool(
+        "execute_plan_tool",
+        description="Execute a plan from the planning agent tool.",
+    )
+    def execute_plan(plan: str):
+        """Execute plan item by item."""
+
+        print("EXECUTING PLAN")
+        if plan.startswith("<PLAN>") and plan.endswith("</PLAN>"):
+            summaries = []
+
+            plan_steps = yaml.safe_load(
+                plan.replace("<PLAN>", "").replace("</PLAN>", "").strip()
+            )
+            for step in plan_steps:
+                step_prompt = "You are contributing to a larger solution.\n\n"
+                if len(summaries) > 0:
+                    last_step_summary = summaries[-1]
+                    step_prompt += (
+                        f"Previous-step summary: {last_step_summary}\n\n"
+                    )
+                step_prompt += f"Now, execute the following step (and if you write any code, be sure to execute the code to make sure it works):\n{yaml.dump(step)}"
+                print(step_prompt)
+
+                result = execution_agent.invoke({
+                    "messages": [HumanMessage(step_prompt)],
+                    "workspace": str(workspace),
+                })
+                last_step_summary = result["messages"][-1].content
+                summaries.append(last_step_summary)
+            return "Grand summary of plan execution:\n\n" + "\n\n".join(
+                summaries
+            )
+        else:
+            return (
+                "Could not use `execute_plan` tool execute plan "
+                "as plan does not start/end with <PLAN>/</PLAN>."
+            )
+
+    return execute_plan
+
+
+def make_planning_tool(llm: BaseChatModel, max_reflection_steps: int):
+    planning_agent = PlanningAgent(
+        llm, max_reflection_steps=max_reflection_steps
+    )._action
+
+    @tool(
+        "planning_agent",
+        description="Create plans for arbitrary tasks",
+    )
+    def call_agent(query: str):
+        result = planning_agent.invoke({
+            "messages": [HumanMessage(query)],
+            "reflection_steps": max_reflection_steps,
+        })
+        # return result["messages"][-1].content
+        plan = f"<PLAN>\n{yaml.dump(result['plan_steps'])}\n</PLAN>"
+        print(plan)
+        return plan
+
+    return call_agent
+
+
+def make_execution_tool(llm: BaseChatModel, workspace: Path):
+    execution_agent = ExecutionAgent(llm)._action
+
+    @tool(
+        "execution_agent",
+        description="Read and edit scripts/code, and execute arbitrary commands on command line.",
+    )
+    def call_agent(query: str):
+        result = execution_agent.invoke({
+            "messages": [HumanMessage(query)],
+            "workspace": str(workspace),
+        })
+        return result["messages"][-1].content
+
+    return call_agent
 
 
 class Ursa:
@@ -74,49 +139,17 @@ def __init__(
             workspace
         )
 
-    def make_planning_tool(self):
-        planning_agent = PlanningAgent(
-            self.llm,
-            max_reflection_steps=self.max_reflection_steps,
-            thread_id=self.thread_id,
-        )
-
-        @tool(
-            "planning_agent",
-            description="Create plans for arbitrary tasks",
-        )
-        def call_agent(query: str):
-            result = planning_agent.invoke({
-                "messages": [HumanMessage(query)],
-            })
-            return result["messages"][-1].content
-
-        return call_agent
-
-    def make_execution_tool(self):
-        execution_agent = ExecutionAgent(self.llm, thread_id=self.thread_id)
-
-        @tool(
-            "execution_agent",
-            description="Read and edit scripts/code, and execute arbitrary commands on command line.",
-        )
-        def call_agent(query: str):
-            result = execution_agent.invoke({
-                "messages": [HumanMessage(query)],
-                "workspace": str(self.workspace),
-            })
-            return result["messages"][-1].content
-
-        return call_agent
-
     def create(self, **kwargs):
         """Create agent.
 
         kwargs: for `create_agent`
         """
         self.subagents = [
-            self.make_execution_tool(),
-            self.make_planning_tool(),
+            make_execution_tool(llm=self.llm, workspace=self.workspace),
+            make_planning_tool(
+                llm=self.llm, max_reflection_steps=self.max_reflection_steps
+            ),
+            make_execute_plan_tool(llm=self.llm, workspace=self.workspace),
         ]
         self.tools = self.subagents + self.extra_tools
         return create_agent(

From 066cedc023e683877cae7605eb69b394e5de0a08 Mon Sep 17 00:00:00 2001
From: Arthur Lui <alui@lanl.gov>
Date: Fri, 5 Dec 2025 17:02:45 -0700
Subject: [PATCH 03/36] serialize plan as JSON instead of YAML

---
 dev/test.py                                | 5 +++--
 src/ursa/experimental/agents/multiagent.py | 8 ++++----
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/dev/test.py b/dev/test.py
index 008bb531..84bee474 100644
--- a/dev/test.py
+++ b/dev/test.py
@@ -77,8 +77,9 @@ def run(query: str):
 to write code.  The analysis should be **very minimal** and AS CONCISE AS
 POSSIBLE.
 
-**Finally**, EXECUTE THE PLAN using execute_plan_tool. Write any code to
-*`output/`. DO NOT write anything to `data/`.
+**Finally**, EXECUTE THE PLAN using execute_plan_tool. Write all code to
+*`output/analysis.py`. DO NOT write anything to `data/`.  Do not write any other
+*files. I want a single file with the entire analysis.
 """
 run(query)
 
diff --git a/src/ursa/experimental/agents/multiagent.py b/src/ursa/experimental/agents/multiagent.py
index 03f5b55b..ab99dd77 100644
--- a/src/ursa/experimental/agents/multiagent.py
+++ b/src/ursa/experimental/agents/multiagent.py
@@ -1,7 +1,7 @@
+import json
 from pathlib import Path
 from typing import Optional
 
-import yaml
 from langchain.agents import create_agent
 from langchain.chat_models import BaseChatModel
 from langchain.messages import HumanMessage
@@ -47,7 +47,7 @@ def execute_plan(plan: str):
         if plan.startswith("<PLAN>") and plan.endswith("</PLAN>"):
             summaries = []
 
-            plan_steps = yaml.safe_load(
+            plan_steps = json.loads(
                 plan.replace("<PLAN>", "").replace("</PLAN>", "").strip()
             )
             for step in plan_steps:
@@ -57,7 +57,7 @@ def execute_plan(plan: str):
                     step_prompt += (
                         f"Previous-step summary: {last_step_summary}\n\n"
                     )
-                step_prompt += f"Now, execute the following step (and if you write any code, be sure to execute the code to make sure it works):\n{yaml.dump(step)}"
+                step_prompt += f"Now, execute the following step (and if you write any code, be sure to execute the code to make sure it works):\n{json.dumps(step)}"
                 print(step_prompt)
 
                 result = execution_agent.invoke({
@@ -93,7 +93,7 @@ def call_agent(query: str):
             "reflection_steps": max_reflection_steps,
         })
         # return result["messages"][-1].content
-        plan = f"<PLAN>\n{yaml.dump(result['plan_steps'])}\n</PLAN>"
+        plan = f"<PLAN>\n{json.dumps(result['plan_steps'])}\n</PLAN>"
         print(plan)
         return plan
 

From 9b346942350b40d103b1d8b4986724b12c028c58 Mon Sep 17 00:00:00 2001
From: Arthur Lui <alui@lanl.gov>
Date: Fri, 5 Dec 2025 17:17:05 -0700
Subject: [PATCH 04/36] pass original task along with plan steps

---
 dev/test.py                                |  4 ++--
 src/ursa/experimental/agents/multiagent.py | 11 ++++++++---
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/dev/test.py b/dev/test.py
index 84bee474..7fbb6954 100644
--- a/dev/test.py
+++ b/dev/test.py
@@ -78,8 +78,8 @@ def run(query: str):
 POSSIBLE.
 
 **Finally**, EXECUTE THE PLAN using execute_plan_tool. Write all code to
-*`output/analysis.py`. DO NOT write anything to `data/`.  Do not write any other
-*files. I want a single file with the entire analysis.
+*`analysis.py`. DO NOT write anything to `data/`.  Do not write any other
+files. I want a single file with the entire analysis.
 """
 run(query)
 
diff --git a/src/ursa/experimental/agents/multiagent.py b/src/ursa/experimental/agents/multiagent.py
index ab99dd77..bb29f715 100644
--- a/src/ursa/experimental/agents/multiagent.py
+++ b/src/ursa/experimental/agents/multiagent.py
@@ -47,11 +47,15 @@ def execute_plan(plan: str):
         if plan.startswith("<PLAN>") and plan.endswith("</PLAN>"):
             summaries = []
 
-            plan_steps = json.loads(
+            task_and_plan_steps = json.loads(
                 plan.replace("<PLAN>", "").replace("</PLAN>", "").strip()
             )
+            task = task_and_plan_steps[0]["task"]
+            plan_steps = task_and_plan_steps[1:]
             for step in plan_steps:
-                step_prompt = "You are contributing to a larger solution.\n\n"
+                step_prompt = (
+                    f"You are contributing to a larger solution:\n{task}.\n\n"
+                )
                 if len(summaries) > 0:
                     last_step_summary = summaries[-1]
                     step_prompt += (
@@ -93,7 +97,8 @@ def call_agent(query: str):
             "reflection_steps": max_reflection_steps,
         })
         # return result["messages"][-1].content
-        plan = f"<PLAN>\n{json.dumps(result['plan_steps'])}\n</PLAN>"
+        plan_steps = [{"task": query}] + result["plan_steps"]
+        plan = f"<PLAN>\n{json.dumps(plan_steps)}\n</PLAN>"
         print(plan)
         return plan
 

From fb2d2e6322851e17c1177d300e1242cce26c2005 Mon Sep 17 00:00:00 2001
From: Arthur Lui <alui@lanl.gov>
Date: Sun, 7 Dec 2025 12:20:05 -0700
Subject: [PATCH 05/36] tell agent to run python via uv run

---
 dev/test.py                                | 2 +-
 src/ursa/experimental/agents/multiagent.py | 4 ++++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/dev/test.py b/dev/test.py
index 7fbb6954..f4a9807d 100644
--- a/dev/test.py
+++ b/dev/test.py
@@ -9,7 +9,7 @@
 
 from ursa.experimental.agents.multiagent import Ursa
 
-aiportal = False
+aiportal = True
 
 if aiportal:
     llm = ChatOpenAI(
diff --git a/src/ursa/experimental/agents/multiagent.py b/src/ursa/experimental/agents/multiagent.py
index bb29f715..7e0cfe05 100644
--- a/src/ursa/experimental/agents/multiagent.py
+++ b/src/ursa/experimental/agents/multiagent.py
@@ -28,6 +28,10 @@
 * execute_plan_tool
   * Use this tool if you are asked to execute a plan that starts with <PLAN> and ends with </PLAN>.
   * Do not use this tool if the <PLAN></PLAN> tags are not present in the instruction!
+
+Note that this project is managed by `uv. So, if you need to execute python
+code, you MUST run `uv run path/to/file.py`. 
+DO NOT run `python /path/to/file.py` or `python3 /path/to/file.py`.
 """
 
 

From 659eeea898991d04d078fa18fc66da0133fca67f Mon Sep 17 00:00:00 2001
From: Arthur Lui <alui@lanl.gov>
Date: Mon, 8 Dec 2025 12:42:52 -0700
Subject: [PATCH 06/36] improve demo

---
 dev/.gitignore                             |  1 -
 dev/dev-workspace/.gitignore               |  2 ++
 dev/test.py                                |  4 ++-
 src/ursa/experimental/agents/multiagent.py | 29 ++++++++++++++--------
 4 files changed, 23 insertions(+), 13 deletions(-)
 delete mode 100644 dev/.gitignore
 create mode 100644 dev/dev-workspace/.gitignore

diff --git a/dev/.gitignore b/dev/.gitignore
deleted file mode 100644
index 0e51ad35..00000000
--- a/dev/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-ursa-workspace/
diff --git a/dev/dev-workspace/.gitignore b/dev/dev-workspace/.gitignore
new file mode 100644
index 00000000..352cdd3d
--- /dev/null
+++ b/dev/dev-workspace/.gitignore
@@ -0,0 +1,2 @@
+*
+!data.csv
diff --git a/dev/test.py b/dev/test.py
index f4a9807d..6c668f3f 100644
--- a/dev/test.py
+++ b/dev/test.py
@@ -1,4 +1,5 @@
 import os
+from pathlib import Path
 
 import httpx
 from langchain.chat_models import init_chat_model
@@ -9,7 +10,7 @@
 
 from ursa.experimental.agents.multiagent import Ursa
 
-aiportal = True
+aiportal = False
 
 if aiportal:
     llm = ChatOpenAI(
@@ -27,6 +28,7 @@
 agent = Ursa(
     llm,
     max_reflection_steps=0,
+    workspace=Path("dev-workspace"),
     checkpointer=InMemorySaver(),
 ).create()
 
diff --git a/src/ursa/experimental/agents/multiagent.py b/src/ursa/experimental/agents/multiagent.py
index 7e0cfe05..6eec769f 100644
--- a/src/ursa/experimental/agents/multiagent.py
+++ b/src/ursa/experimental/agents/multiagent.py
@@ -2,6 +2,7 @@
 from pathlib import Path
 from typing import Optional
 
+import yaml
 from langchain.agents import create_agent
 from langchain.chat_models import BaseChatModel
 from langchain.messages import HumanMessage
@@ -12,26 +13,28 @@
 from ursa.util import Checkpointer
 
 system_prompt = """\
-You are an data scientist with multiple tools.
+You are a data scientist with multiple tools.
 
 These tools are available to you:
 
 * planning_agent
   * Use this tool whenever you are asked to plan out tasks.
-  * In each step of your plan, if code needs to be generated, please 
-    explicitly state in the step that code needs to be written and executed.
+  * In each step of your plan, if code needs to be generated, please explicitly
+    state in the step that code needs to be written and executed.
 
 * execution_agent
-  * Use this tool **whenever** you are asked to write/edit code or run arbitrary
-    commands from the command line.
+  * Use this tool **whenever** you are asked to write/edit code or run
+    arbitrary commands from the command line.
 
 * execute_plan_tool
-  * Use this tool if you are asked to execute a plan that starts with <PLAN> and ends with </PLAN>.
-  * Do not use this tool if the <PLAN></PLAN> tags are not present in the instruction!
+  * Use this tool if you are asked to execute a plan that starts with <PLAN>
+    and ends with </PLAN>.
+  * Do not use this tool if the <PLAN></PLAN> tags are not present in the
+    instruction!
 
 Note that this project is managed by `uv. So, if you need to execute python
-code, you MUST run `uv run path/to/file.py`. 
-DO NOT run `python /path/to/file.py` or `python3 /path/to/file.py`.
+code, you MUST run `uv run path/to/file.py`. DO NOT run `python
+/path/to/file.py` or `python3 /path/to/file.py`.
 """
 
 
@@ -58,14 +61,18 @@ def execute_plan(plan: str):
             plan_steps = task_and_plan_steps[1:]
             for step in plan_steps:
                 step_prompt = (
-                    f"You are contributing to a larger solution:\n{task}.\n\n"
+                    f"You are contributing to a larger solution:\n{task}\n\n"
                 )
                 if len(summaries) > 0:
                     last_step_summary = summaries[-1]
                     step_prompt += (
                         f"Previous-step summary: {last_step_summary}\n\n"
                     )
-                step_prompt += f"Now, execute the following step (and if you write any code, be sure to execute the code to make sure it works):\n{json.dumps(step)}"
+                step_prompt += (
+                    "Now, execute the following step (and if you write any "
+                    "code, be sure to execute the code to make sure it "
+                    f"works):\n{yaml.dump(step)}"
+                )
                 print(step_prompt)
 
                 result = execution_agent.invoke({

From 4173c11001b3f9a7278c7c7276b04382f2cdfb57 Mon Sep 17 00:00:00 2001
From: Arthur Lui <alui@lanl.gov>
Date: Mon, 8 Dec 2025 12:43:19 -0700
Subject: [PATCH 07/36] add TODO to make uv run a safe command

---
 dev/test.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/dev/test.py b/dev/test.py
index 6c668f3f..f98cf29c 100644
--- a/dev/test.py
+++ b/dev/test.py
@@ -67,6 +67,7 @@ def run(query: str):
 
 # run("Can you now execute the plan?")
 
+# TODO: Need to make `uv run` a SAFE command.
 query = """
 I have a file `data/data.csv`. 
 

From aa68c1a28a339785cd59d57b6bd0ae7deb7c4b1b Mon Sep 17 00:00:00 2001
From: Arthur Lui <alui@lanl.gov>
Date: Mon, 8 Dec 2025 13:01:23 -0700
Subject: [PATCH 08/36] use gpt-5 in dev test

---
 dev/test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dev/test.py b/dev/test.py
index f98cf29c..866e6f12 100644
--- a/dev/test.py
+++ b/dev/test.py
@@ -22,7 +22,7 @@
     )
 else:
     # llm = init_chat_model("ollama:ministral-3:14b")
-    llm = init_chat_model("openai:gpt-5-nano")
+    llm = init_chat_model("openai:gpt-5")
 
 
 agent = Ursa(

From 983acba3d82308db44635862b158b28a96fcc96e Mon Sep 17 00:00:00 2001
From: Arthur Lui <alui@lanl.gov>
Date: Mon, 8 Dec 2025 13:06:06 -0700
Subject: [PATCH 09/36] add final conciseness step to dev test query

---
 dev/test.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/dev/test.py b/dev/test.py
index 866e6f12..a27d8c4d 100644
--- a/dev/test.py
+++ b/dev/test.py
@@ -80,9 +80,11 @@ def run(query: str):
 to write code.  The analysis should be **very minimal** and AS CONCISE AS
 POSSIBLE.
 
-**Finally**, EXECUTE THE PLAN using execute_plan_tool. Write all code to
+**Then**, EXECUTE THE PLAN using execute_plan_tool. Write all code to
 *`analysis.py`. DO NOT write anything to `data/`.  Do not write any other
 files. I want a single file with the entire analysis.
+
+**Finally**, Edit *`analysis.py` to make it AS CONCISE AS POSSIBLE.
 """
 run(query)
 

From fe394d5b71ad627bcf67fdf68b65097b19917dbc Mon Sep 17 00:00:00 2001
From: Arthur Lui <alui@lanl.gov>
Date: Mon, 8 Dec 2025 13:06:40 -0700
Subject: [PATCH 10/36] fix capitalization in dev test query

---
 dev/test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dev/test.py b/dev/test.py
index a27d8c4d..246c75e2 100644
--- a/dev/test.py
+++ b/dev/test.py
@@ -84,7 +84,7 @@ def run(query: str):
 *`analysis.py`. DO NOT write anything to `data/`.  Do not write any other
 files. I want a single file with the entire analysis.
 
-**Finally**, Edit *`analysis.py` to make it AS CONCISE AS POSSIBLE.
+**Finally**, edit *`analysis.py` to make it AS CONCISE AS POSSIBLE.
 """
 run(query)
 

From 598309f0155b2a3e64709d8174063e7c50a14e00 Mon Sep 17 00:00:00 2001
From: Arthur Lui <alui@lanl.gov>
Date: Tue, 9 Dec 2025 11:37:29 -0700
Subject: [PATCH 11/36] add todo for input/output control between agents

---
 src/ursa/experimental/agents/multiagent.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/ursa/experimental/agents/multiagent.py b/src/ursa/experimental/agents/multiagent.py
index 6eec769f..b463c67a 100644
--- a/src/ursa/experimental/agents/multiagent.py
+++ b/src/ursa/experimental/agents/multiagent.py
@@ -40,6 +40,8 @@
 
 # NOTE: Is the solution to have a tool that breaks up the string plan, and then
 # execute each section of the plan?
+# TODO: Try doing this instead:
+# https://docs.langchain.com/oss/python/langchain/multi-agent#where-to-customize
 def make_execute_plan_tool(llm: BaseChatModel, workspace: Path):
     execution_agent = ExecutionAgent(llm)._action
 

From 79fc6cc58d39b82296298bbda27b52f287fe209d Mon Sep 17 00:00:00 2001
From: Arthur Lui <alui@lanl.gov>
Date: Tue, 9 Dec 2025 12:33:20 -0700
Subject: [PATCH 12/36] drop dev-workspace .gitignore; add dev justfile (help, clean)

---
 dev/dev-workspace/.gitignore | 2 --
 dev/justfile                 | 6 ++++++
 2 files changed, 6 insertions(+), 2 deletions(-)
 delete mode 100644 dev/dev-workspace/.gitignore
 create mode 100644 dev/justfile

diff --git a/dev/dev-workspace/.gitignore b/dev/dev-workspace/.gitignore
deleted file mode 100644
index 352cdd3d..00000000
--- a/dev/dev-workspace/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-*
-!data.csv
diff --git a/dev/justfile b/dev/justfile
new file mode 100644
index 00000000..f64b8998
--- /dev/null
+++ b/dev/justfile
@@ -0,0 +1,6 @@
+help:
+    just -l -u
+
+clean:
+    rm -rf dev-workspace/__pycache__ dev-workspace/results
+    find dev-workspace -mindepth 1 -not -name data.csv -exec rm -f {} +

From 5beaa26c68df0e120c3b0edb5c26054357833de1 Mon Sep 17 00:00:00 2001
From: Arthur Lui <alui@lanl.gov>
Date: Tue, 9 Dec 2025 12:45:56 -0700
Subject: [PATCH 13/36] allow up to 4 plan steps in dev prompt

---
 dev/test.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/dev/test.py b/dev/test.py
index 246c75e2..63c9a170 100644
--- a/dev/test.py
+++ b/dev/test.py
@@ -74,10 +74,10 @@ def run(query: str):
 **First**, read the first few lines of the file to understand the format.
 Do this quickly; don't go overboard.
 
-**Then**, write a plan (with at most 3 steps) to perform simple linear
+**Then**, write a plan (with at most 4 steps) to perform simple linear
 regression on this data in python. The linear regression must have a slope and
 intercept. The plan MUST NOT include code; though it may include instruction
-to write code.  The analysis should be **very minimal** and AS CONCISE AS
+to write code. The analysis should be **very minimal** and AS CONCISE AS
 POSSIBLE.
 
 **Then**, EXECUTE THE PLAN using execute_plan_tool. Write all code to

From a11869580cefcc8d0d187e8c696d4c1ba55dac63 Mon Sep 17 00:00:00 2001
From: Arthur Lui <alui@lanl.gov>
Date: Fri, 12 Dec 2025 10:01:08 -0700
Subject: [PATCH 14/36] rename dev/test.py to run.py; wrap step prompt parts in XML tags

---
 dev/.gitignore                             |  1 +
 dev/justfile                               |  6 ++++++
 dev/{test.py => run.py}                    | 17 ++++++++++-------
 src/ursa/experimental/agents/multiagent.py | 22 ++++++++++++++--------
 4 files changed, 31 insertions(+), 15 deletions(-)
 create mode 100644 dev/.gitignore
 rename dev/{test.py => run.py} (75%)

diff --git a/dev/.gitignore b/dev/.gitignore
new file mode 100644
index 00000000..76fa7ff5
--- /dev/null
+++ b/dev/.gitignore
@@ -0,0 +1 @@
+dev-workspace
diff --git a/dev/justfile b/dev/justfile
index f64b8998..b1442c8f 100644
--- a/dev/justfile
+++ b/dev/justfile
@@ -4,3 +4,9 @@ help:
 clean:
     rm -rf dev-workspace/__pycache__ dev-workspace/results
     find dev-workspace -mindepth 1 -not -name data.csv -exec rm -f {} +
+
+run:
+    uv run run.py
+
+test:
+    cd dev-workspace && uv run analysis.py
diff --git a/dev/test.py b/dev/run.py
similarity index 75%
rename from dev/test.py
rename to dev/run.py
index 63c9a170..58f954b9 100644
--- a/dev/test.py
+++ b/dev/run.py
@@ -22,7 +22,7 @@
     )
 else:
     # llm = init_chat_model("ollama:ministral-3:14b")
-    llm = init_chat_model("openai:gpt-5")
+    llm = init_chat_model("openai:gpt-5.2")
 
 
 agent = Ursa(
@@ -75,16 +75,19 @@ def run(query: str):
 Do this quickly; don't go overboard.
 
 **Then**, write a plan (with at most 4 steps) to perform simple linear
-regression on this data in python. The linear regression must have a slope and
-intercept. The plan MUST NOT include code; though it may include instruction
-to write code. The analysis should be **very minimal** and AS CONCISE AS
-POSSIBLE.
+regression on this data in python. I care only about the coefficients. Do not
+provide other information or plots. The plan MUST NOT include code; though it
+may include instruction to write code. The analysis should be **very minimal**
+and AS CONCISE AS POSSIBLE.
 
 **Then**, EXECUTE THE PLAN using execute_plan_tool. Write all code to
-*`analysis.py`. DO NOT write anything to `data/`.  Do not write any other
+`analysis.py`. DO NOT write anything to `data/`. Do not write any other
 files. I want a single file with the entire analysis.
 
-**Finally**, edit *`analysis.py` to make it AS CONCISE AS POSSIBLE.
+**Finally**, edit `analysis.py` to make it AS CONCISE AS POSSIBLE. Don't
+include code for assert, plots, etc. I want ONLY a very minimal script that
+reads the data and then prints the linear model's coefficients. Remember, I
+want A SINGLE FILE with the entire analysis (in `analysis.py`).
 """
 run(query)
 
diff --git a/src/ursa/experimental/agents/multiagent.py b/src/ursa/experimental/agents/multiagent.py
index b463c67a..0a753980 100644
--- a/src/ursa/experimental/agents/multiagent.py
+++ b/src/ursa/experimental/agents/multiagent.py
@@ -38,6 +38,10 @@
 """
 
 
+def tag(tag_name: str, content: str):
+    return f"\n<{tag_name}>\n{content}\n</{tag_name}>\n\n"
+
+
 # NOTE: Is the solution to have a tool that breaks up the string plan, and then
 # execute each section of the plan?
 # TODO: Try doing this instead:
@@ -63,18 +67,20 @@ def execute_plan(plan: str):
             plan_steps = task_and_plan_steps[1:]
             for step in plan_steps:
                 step_prompt = (
-                    f"You are contributing to a larger solution:\n{task}\n\n"
+                    "You are contributing a solution to the following overall plan. "
+                    "The overall plan, last step's summary, and next step are as follows."
+                    "With this information, please carry out the next step. "
+                    "IF you write any code, be sure to execute the code to make "
+                    "sure it properly runs."
                 )
+                step_prompt += tag("OVERALL_PLAN", task)
                 if len(summaries) > 0:
                     last_step_summary = summaries[-1]
-                    step_prompt += (
-                        f"Previous-step summary: {last_step_summary}\n\n"
+                    step_prompt += tag(
+                        "SUMMARY_OF_LAST_STEP", last_step_summary
                     )
-                step_prompt += (
-                    "Now, execute the following step (and if you write any "
-                    "code, be sure to execute the code to make sure it "
-                    f"works):\n{yaml.dump(step)}"
-                )
+
+                step_prompt += tag("NEXT_STEP", yaml.dump(step))
                 print(step_prompt)
 
                 result = execution_agent.invoke({

From 45185795de764b72dd68a09a920ba9f2947f2f62 Mon Sep 17 00:00:00 2001
From: Arthur Lui <alui@lanl.gov>
Date: Fri, 12 Dec 2025 11:39:58 -0700
Subject: [PATCH 15/36] format

---
 src/ursa/experimental/agents/multiagent.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/ursa/experimental/agents/multiagent.py b/src/ursa/experimental/agents/multiagent.py
index 0a753980..20196053 100644
--- a/src/ursa/experimental/agents/multiagent.py
+++ b/src/ursa/experimental/agents/multiagent.py
@@ -80,7 +80,7 @@ def execute_plan(plan: str):
                         "SUMMARY_OF_LAST_STEP", last_step_summary
                     )
 
-                step_prompt += tag("NEXT_STEP", yaml.dump(step))
+                step_prompt += tag("NEXT_STEP", yaml.dump(step).strip())
                 print(step_prompt)
 
                 result = execution_agent.invoke({
@@ -118,7 +118,7 @@ def call_agent(query: str):
         # return result["messages"][-1].content
         plan_steps = [{"task": query}] + result["plan_steps"]
         plan = f"<PLAN>\n{json.dumps(plan_steps)}\n</PLAN>"
-        print(plan)
+        print(json.dumps(plan_steps, indent=4))
         return plan
 
     return call_agent

From 0500290d3499ec865eb8ae61236b324c981ada6d Mon Sep 17 00:00:00 2001
From: Arthur Lui <alui@lanl.gov>
Date: Fri, 12 Dec 2025 11:40:10 -0700
Subject: [PATCH 16/36] commit run.py

---
 dev/run.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/dev/run.py b/dev/run.py
index 58f954b9..2daaddbd 100644
--- a/dev/run.py
+++ b/dev/run.py
@@ -75,10 +75,10 @@ def run(query: str):
 Do this quickly; don't go overboard.
 
 **Then**, write a plan (with at most 4 steps) to perform simple linear
-regression on this data in python. I care only about the coefficients. Do not
-provide other information or plots. The plan MUST NOT include code; though it
+regression on this data in python.  The plan MUST NOT include code; though it
 may include instruction to write code. The analysis should be **very minimal**
-and AS CONCISE AS POSSIBLE.
+and AS CONCISE AS POSSIBLE.  I care only about the coefficients (including an
+intercept). Do not provide other information or plots.
 
 **Then**, EXECUTE THE PLAN using execute_plan_tool. Write all code to
 `analysis.py`. DO NOT write anything to `data/`. Do not write any other

From 6c6a54e42579c1a4d435728150a072c3614ae418 Mon Sep 17 00:00:00 2001
From: Arthur Lui <alui@lanl.gov>
Date: Fri, 12 Dec 2025 12:10:14 -0700
Subject: [PATCH 17/36] better print

---
 dev/run.py                                 | 36 +++++++---------------
 src/ursa/experimental/agents/multiagent.py |  9 +++---
 2 files changed, 16 insertions(+), 29 deletions(-)

diff --git a/dev/run.py b/dev/run.py
index 2daaddbd..18b66703 100644
--- a/dev/run.py
+++ b/dev/run.py
@@ -1,27 +1,28 @@
+# NOTE: This will be helpful for prompting.
+# https://cookbook.openai.com/examples/gpt-5/gpt-5_prompting_guide
+
 import os
 from pathlib import Path
 
 import httpx
 from langchain.chat_models import init_chat_model
 from langchain.messages import HumanMessage
-from langchain_openai import ChatOpenAI
 from langgraph.checkpoint.memory import InMemorySaver
-from pydantic import SecretStr
 
 from ursa.experimental.agents.multiagent import Ursa
 
 aiportal = False
 
 if aiportal:
-    llm = ChatOpenAI(
+    llm = init_chat_model(
         model=os.environ["CLAUDE"],
-        # model="gpt-oss-120b",
         base_url=os.environ["AIPORTAL_API_URL"],
-        api_key=SecretStr(os.environ["AIPORTAL_API_KEY"]),
+        api_key=os.environ["AIPORTAL_API_KEY"],
+        model_provider="openai",
+        model_kwargs={"extra_headers": {"disable_fallbacks": "true"}},
         http_client=httpx.Client(verify=False),
     )
 else:
-    # llm = init_chat_model("ollama:ministral-3:14b")
     llm = init_chat_model("openai:gpt-5.2")
 
 
@@ -51,22 +52,6 @@ def run(query: str):
     return result
 
 
-# run(
-#     "Write and execute a very minimal python script to compute Pi using Monte Carlo."
-# )
-
-# run("What did you just do?")
-
-# print(results)
-
-
-# run(
-#     "Write a plan to write a very minimal python script to compute Pi using Monte Carlo."
-#     "After planning, please execute the plan step by step. Save any code to disk."
-# )
-
-# run("Can you now execute the plan?")
-
 # TODO: Need to make `uv run` a SAFE command.
 query = """
 I have a file `data/data.csv`. 
@@ -85,9 +70,10 @@ def run(query: str):
 files. I want a single file with the entire analysis.
 
 **Finally**, edit `analysis.py` to make it AS CONCISE AS POSSIBLE. Don't
-include code for assert, plots, etc. I want ONLY a very minimal script that
-reads the data and then prints the linear model's coefficients. Remember, I
-want A SINGLE FILE with the entire analysis (in `analysis.py`).
+include code for assert, raising errors, exception handling, plots, etc. I want
+ONLY a very minimal script that reads the data and then prints the linear
+model's coefficients. Remember, I want A SINGLE FILE with the entire analysis
+(in `analysis.py`).
 """
 run(query)
 
diff --git a/src/ursa/experimental/agents/multiagent.py b/src/ursa/experimental/agents/multiagent.py
index 20196053..7e25a038 100644
--- a/src/ursa/experimental/agents/multiagent.py
+++ b/src/ursa/experimental/agents/multiagent.py
@@ -67,9 +67,9 @@ def execute_plan(plan: str):
             plan_steps = task_and_plan_steps[1:]
             for step in plan_steps:
                 step_prompt = (
-                    "You are contributing a solution to the following overall plan. "
-                    "The overall plan, last step's summary, and next step are as follows."
-                    "With this information, please carry out the next step. "
+                    "You are contributing a solution of an overall plan. "
+                    "The overall plan, last step's summary, and next step are provided below. "
+                    "With the provided information, please carry out the next step. "
                     "IF you write any code, be sure to execute the code to make "
                     "sure it properly runs."
                 )
@@ -118,7 +118,8 @@ def call_agent(query: str):
         # return result["messages"][-1].content
         plan_steps = [{"task": query}] + result["plan_steps"]
         plan = f"<PLAN>\n{json.dumps(plan_steps)}\n</PLAN>"
-        print(json.dumps(plan_steps, indent=4))
+        # print(json.dumps(plan_steps, indent=4))
+        print(yaml.dump(plan_steps))
         return plan
 
     return call_agent

From 3cd1993b6d08301898167a876a78362a3cf782ed Mon Sep 17 00:00:00 2001
From: Arthur Lui <alui@lanl.gov>
Date: Mon, 15 Dec 2025 14:43:59 -0700
Subject: [PATCH 18/36] add multiagent test

---
 tests/agents/test_multiagent/.gitignore       |   1 +
 .../agents/test_multiagent/test_multiagent.py | 112 ++++++++++++++++++
 2 files changed, 113 insertions(+)
 create mode 100644 tests/agents/test_multiagent/.gitignore
 create mode 100644 tests/agents/test_multiagent/test_multiagent.py

diff --git a/tests/agents/test_multiagent/.gitignore b/tests/agents/test_multiagent/.gitignore
new file mode 100644
index 00000000..e9ed58f7
--- /dev/null
+++ b/tests/agents/test_multiagent/.gitignore
@@ -0,0 +1 @@
+workspace/
diff --git a/tests/agents/test_multiagent/test_multiagent.py b/tests/agents/test_multiagent/test_multiagent.py
new file mode 100644
index 00000000..c8f3dd55
--- /dev/null
+++ b/tests/agents/test_multiagent/test_multiagent.py
@@ -0,0 +1,112 @@
+# NOTE: This will be helpful for prompting.
+# https://cookbook.openai.com/examples/gpt-5/gpt-5_prompting_guide
+
+
+import os
+from pathlib import Path
+
+import httpx
+from langchain.chat_models import init_chat_model
+from langchain.messages import HumanMessage
+from langgraph.checkpoint.memory import InMemorySaver
+
+from ursa.experimental.agents.multiagent import Ursa
+
+aiportal = False
+
+if aiportal:
+    llm = init_chat_model(
+        model=os.environ["CLAUDE"],
+        base_url=os.environ["AIPORTAL_API_URL"],
+        api_key=os.environ["AIPORTAL_API_KEY"],
+        model_provider="openai",
+        model_kwargs={"extra_headers": {"disable_fallbacks": "true"}},
+        http_client=httpx.Client(verify=False),
+    )
+else:
+    llm = init_chat_model("openai:gpt-5.2")
+
+
+def generate_data(data_path: Path):
+    import numpy as np
+    import pandas as pd
+
+    rng = np.random.default_rng(0)
+    x = rng.uniform(0, 1, 100)
+    y = rng.normal(2 * x + 1, 0.1)
+    pd.DataFrame(dict(x=x, y=y)).to_csv(data_path, index=False)
+
+
+workspace = Path(__file__).parent / "workspace"
+data_dir = workspace / "data"
+data_csv = data_dir / "data.csv"
+if not data_csv.exists():
+    data_dir.mkdir(exist_ok=True, parents=True)
+    generate_data(data_dir / "data.csv")
+
+agent = Ursa(
+    llm,
+    max_reflection_steps=0,
+    workspace=workspace,
+    checkpointer=InMemorySaver(),
+).create()
+
+results = []
+
+
+def run(query: str):
+    print(f"Task:\n{query}")
+    results.append(
+        result := agent.invoke(
+            {"messages": [HumanMessage(query)]},
+            {
+                "configurable": {
+                    "thread_id": "ursa",
+                },
+                "recursion_limit": 50,
+            },
+        )
+    )
+    return result
+
+
+# TODO: Need to make `uv run` a SAFE command.
+query_1 = """
+I have a file `data/data.csv`. 
+
+**First**, read the first few lines of the file to understand the format.
+Do this quickly; don't go overboard.
+
+**Then**, write a plan (with at most 4 steps) to perform simple linear
+regression on this data in python.  The plan MUST NOT include code; though it
+may include instruction to write code. The analysis should be **very minimal**
+and AS CONCISE AS POSSIBLE.  I care only about the coefficients (including an
+intercept). Do not provide other information or plots.
+
+**Then**, EXECUTE THE PLAN using execute_plan_tool. Write all code to
+`analysis.py`. DO NOT write anything to `data/`. Do not write any other
+files. I want a single file with the entire analysis.
+
+**Finally**, edit `analysis.py` to make it AS CONCISE AS POSSIBLE. Don't
+include code for assert, raising errors, exception handling, plots, etc. I want
+ONLY a very minimal script that reads the data and then prints the linear
+model's coefficients. Remember, I want A SINGLE FILE with the entire analysis
+(in `analysis.py`).
+"""
+
+query_2 = """
+I have a file `data/data.csv`. 
+
+Please write a very minimal python script to perform linear regression on this
+data.  The analysis shoud be as concise as possible. I care only about the
+coefficients (including an intercept).  Do not provide other information or
+plots. Write the analysis to `analysis.py`. Run the code to ensure it works. 
+"""
+
+
+def test_multiagent():
+    run(query_1)
+
+    for result in results:
+        for msg in result["messages"]:
+            msg.pretty_print()

From 39f5d3802c4a00029a480c2e0019795e6bae0f7b Mon Sep 17 00:00:00 2001
From: Arthur Lui <alui@lanl.gov>
Date: Mon, 15 Dec 2025 14:44:25 -0700
Subject: [PATCH 19/36] remove dev

---
 dev/.gitignore |  1 -
 dev/justfile   | 12 --------
 dev/run.py     | 82 --------------------------------------------------
 3 files changed, 95 deletions(-)
 delete mode 100644 dev/.gitignore
 delete mode 100644 dev/justfile
 delete mode 100644 dev/run.py

diff --git a/dev/.gitignore b/dev/.gitignore
deleted file mode 100644
index 76fa7ff5..00000000
--- a/dev/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-dev-workspace
diff --git a/dev/justfile b/dev/justfile
deleted file mode 100644
index b1442c8f..00000000
--- a/dev/justfile
+++ /dev/null
@@ -1,12 +0,0 @@
-help:
-    just -l -u
-
-clean:
-    rm -rf dev-workspace/__pycache__ dev-workspace/results
-    find dev-workspace -mindepth 1 -not -name data.csv -exec rm -f {} +
-
-run:
-    uv run run.py
-
-test:
-    cd dev-workspace && uv run analysis.py
diff --git a/dev/run.py b/dev/run.py
deleted file mode 100644
index 18b66703..00000000
--- a/dev/run.py
+++ /dev/null
@@ -1,82 +0,0 @@
-# NOTE: This will be helpful for prompting.
-# https://cookbook.openai.com/examples/gpt-5/gpt-5_prompting_guide
-
-import os
-from pathlib import Path
-
-import httpx
-from langchain.chat_models import init_chat_model
-from langchain.messages import HumanMessage
-from langgraph.checkpoint.memory import InMemorySaver
-
-from ursa.experimental.agents.multiagent import Ursa
-
-aiportal = False
-
-if aiportal:
-    llm = init_chat_model(
-        model=os.environ["CLAUDE"],
-        base_url=os.environ["AIPORTAL_API_URL"],
-        api_key=os.environ["AIPORTAL_API_KEY"],
-        model_provider="openai",
-        model_kwargs={"extra_headers": {"disable_fallbacks": "true"}},
-        http_client=httpx.Client(verify=False),
-    )
-else:
-    llm = init_chat_model("openai:gpt-5.2")
-
-
-agent = Ursa(
-    llm,
-    max_reflection_steps=0,
-    workspace=Path("dev-workspace"),
-    checkpointer=InMemorySaver(),
-).create()
-
-results = []
-
-
-def run(query: str):
-    print(f"Task:\n{query}")
-    results.append(
-        result := agent.invoke(
-            {"messages": [HumanMessage(query)]},
-            {
-                "configurable": {
-                    "thread_id": "ursa",
-                },
-                "recursion_limit": 50,
-            },
-        )
-    )
-    return result
-
-
-# TODO: Need to make `uv run` a SAFE command.
-query = """
-I have a file `data/data.csv`. 
-
-**First**, read the first few lines of the file to understand the format.
-Do this quickly; don't go overboard.
-
-**Then**, write a plan (with at most 4 steps) to perform simple linear
-regression on this data in python.  The plan MUST NOT include code; though it
-may include instruction to write code. The analysis should be **very minimal**
-and AS CONCISE AS POSSIBLE.  I care only about the coefficients (including an
-intercept). Do not provide other information or plots.
-
-**Then**, EXECUTE THE PLAN using execute_plan_tool. Write all code to
-`analysis.py`. DO NOT write anything to `data/`. Do not write any other
-files. I want a single file with the entire analysis.
-
-**Finally**, edit `analysis.py` to make it AS CONCISE AS POSSIBLE. Don't
-include code for assert, raising errors, exception handling, plots, etc. I want
-ONLY a very minimal script that reads the data and then prints the linear
-model's coefficients. Remember, I want A SINGLE FILE with the entire analysis
-(in `analysis.py`).
-"""
-run(query)
-
-for result in results:
-    for msg in result["messages"]:
-        msg.pretty_print()

From 4c8b8aa7f755f448ff4fc1b91a27f9c4222cf313 Mon Sep 17 00:00:00 2001
From: Arthur Lui <alui@lanl.gov>
Date: Mon, 15 Dec 2025 14:44:56 -0700
Subject: [PATCH 20/36] remove deep agent

---
 src/ursa/experimental/agents/deep.py | 42 ----------------------------
 1 file changed, 42 deletions(-)
 delete mode 100644 src/ursa/experimental/agents/deep.py

diff --git a/src/ursa/experimental/agents/deep.py b/src/ursa/experimental/agents/deep.py
deleted file mode 100644
index 372799cd..00000000
--- a/src/ursa/experimental/agents/deep.py
+++ /dev/null
@@ -1,42 +0,0 @@
-from deepagents import CompiledSubAgent, create_deep_agent
-from langchain.chat_models import init_chat_model
-from langchain.messages import HumanMessage
-
-from ursa.agents import ExecutionAgent
-
-llm = init_chat_model("openai:gpt-5-nano")
-exe_graph = ExecutionAgent(llm=llm)._action
-
-# # Create a custom agent graph
-# custom_graph = create_agent(
-#     model=exe_graph,
-#     # tools=specialized_tools,
-#     system_prompt="You are a specialized agent for data analysis...",
-# )
-
-# Use it as a custom subagent
-exe_subagent = CompiledSubAgent(
-    name="executor",
-    description="Specialized agent for writing/executing code",
-    runnable=exe_graph,
-)
-
-subagents = [exe_subagent]
-
-agent = create_deep_agent(
-    model=llm,
-    # tools=[internet_search],
-    system_prompt="You are a data scientist. When asked to write code, use the executor agent.",
-    subagents=[exe_subagent],
-)
-
-
-results = []
-
-
-def run(query: str):
-    results.append(result := agent.invoke({"messages": [HumanMessage(query)]}))
-    return result
-
-
-run("Write a very minimal python script to compute Pi using Monte Carlo.")

From 355b180b00b093236b2a16d8eb6c72e8b136933f Mon Sep 17 00:00:00 2001
From: Arthur Lui <alui@lanl.gov>
Date: Mon, 15 Dec 2025 14:51:32 -0700
Subject: [PATCH 21/36] add docstring to tag(); trim stale comments in multiagent

---
 src/ursa/experimental/agents/multiagent.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/ursa/experimental/agents/multiagent.py b/src/ursa/experimental/agents/multiagent.py
index 7e25a038..25a34326 100644
--- a/src/ursa/experimental/agents/multiagent.py
+++ b/src/ursa/experimental/agents/multiagent.py
@@ -39,12 +39,11 @@
 
 
 def tag(tag_name: str, content: str):
+    """Wrap content in XML tag"""
     return f"\n<{tag_name}>\n{content}\n</{tag_name}>\n\n"
 
 
-# NOTE: Is the solution to have a tool that breaks up the string plan, and then
-# execute each section of the plan?
-# TODO: Try doing this instead:
+# NOTE: Resources
 # https://docs.langchain.com/oss/python/langchain/multi-agent#where-to-customize
 def make_execute_plan_tool(llm: BaseChatModel, workspace: Path):
     execution_agent = ExecutionAgent(llm)._action

From 52a4188bfb7ee39da93e41c793cd935cf3be677d Mon Sep 17 00:00:00 2001
From: Arthur Lui <alui@lanl.gov>
Date: Thu, 18 Dec 2025 14:44:10 -0700
Subject: [PATCH 22/36] add comments

---
 tests/agents/test_multiagent/test_multiagent.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tests/agents/test_multiagent/test_multiagent.py b/tests/agents/test_multiagent/test_multiagent.py
index c8f3dd55..bf2b516f 100644
--- a/tests/agents/test_multiagent/test_multiagent.py
+++ b/tests/agents/test_multiagent/test_multiagent.py
@@ -37,6 +37,7 @@ def generate_data(data_path: Path):
     pd.DataFrame(dict(x=x, y=y)).to_csv(data_path, index=False)
 
 
+# Generate data if not already present.
 workspace = Path(__file__).parent / "workspace"
 data_dir = workspace / "data"
 data_csv = data_dir / "data.csv"
@@ -44,6 +45,7 @@ def generate_data(data_path: Path):
     data_dir.mkdir(exist_ok=True, parents=True)
     generate_data(data_dir / "data.csv")
 
+# Initialize agent.
 agent = Ursa(
     llm,
     max_reflection_steps=0,
@@ -51,6 +53,7 @@ def generate_data(data_path: Path):
     checkpointer=InMemorySaver(),
 ).create()
 
+# Store results (AI output) in this list.
 results = []
 
 
@@ -94,6 +97,7 @@ def run(query: str):
 (in `analysis.py`).
 """
 
+# An alternate query to test.
 query_2 = """
 I have a file `data/data.csv`. 
 

From 187da3bb4844662b1b4f3234cd003776ce12b027 Mon Sep 17 00:00:00 2001
From: Arthur Lui <alui@lanl.gov>
Date: Thu, 18 Dec 2025 17:22:05 -0700
Subject: [PATCH 23/36] move test setup into test_multiagent; rename aiportal flag

---
 .../agents/test_multiagent/test_multiagent.py | 75 +++++++++----------
 1 file changed, 37 insertions(+), 38 deletions(-)

diff --git a/tests/agents/test_multiagent/test_multiagent.py b/tests/agents/test_multiagent/test_multiagent.py
index bf2b516f..c86348ba 100644
--- a/tests/agents/test_multiagent/test_multiagent.py
+++ b/tests/agents/test_multiagent/test_multiagent.py
@@ -12,9 +12,9 @@
 
 from ursa.experimental.agents.multiagent import Ursa
 
-aiportal = False
+use_aiportal = False
 
-if aiportal:
+if use_aiportal:
     llm = init_chat_model(
         model=os.environ["CLAUDE"],
         base_url=os.environ["AIPORTAL_API_URL"],
@@ -24,6 +24,7 @@
         http_client=httpx.Client(verify=False),
     )
 else:
+    # Use openai
     llm = init_chat_model("openai:gpt-5.2")
 
 
@@ -37,42 +38,6 @@ def generate_data(data_path: Path):
     pd.DataFrame(dict(x=x, y=y)).to_csv(data_path, index=False)
 
 
-# Generate data if not already present.
-workspace = Path(__file__).parent / "workspace"
-data_dir = workspace / "data"
-data_csv = data_dir / "data.csv"
-if not data_csv.exists():
-    data_dir.mkdir(exist_ok=True, parents=True)
-    generate_data(data_dir / "data.csv")
-
-# Initialize agent.
-agent = Ursa(
-    llm,
-    max_reflection_steps=0,
-    workspace=workspace,
-    checkpointer=InMemorySaver(),
-).create()
-
-# Store results (AI output) in this list.
-results = []
-
-
-def run(query: str):
-    print(f"Task:\n{query}")
-    results.append(
-        result := agent.invoke(
-            {"messages": [HumanMessage(query)]},
-            {
-                "configurable": {
-                    "thread_id": "ursa",
-                },
-                "recursion_limit": 50,
-            },
-        )
-    )
-    return result
-
-
 # TODO: Need to make `uv run` a SAFE command.
 query_1 = """
 I have a file `data/data.csv`. 
@@ -109,6 +74,40 @@ def run(query: str):
 
 
 def test_multiagent():
+    # Generate data if not already present.
+    workspace = Path(__file__).parent / "workspace"
+    data_dir = workspace / "data"
+    data_csv = data_dir / "data.csv"
+    if not data_csv.exists():
+        data_dir.mkdir(exist_ok=True, parents=True)
+        generate_data(data_dir / "data.csv")
+
+    # Initialize agent.
+    agent = Ursa(
+        llm,
+        max_reflection_steps=0,
+        workspace=workspace,
+        checkpointer=InMemorySaver(),
+    ).create()
+
+    # Store results (AI output) in this list.
+    results = []
+
+    def run(query: str):
+        print(f"Task:\n{query}")
+        results.append(
+            result := agent.invoke(
+                {"messages": [HumanMessage(query)]},
+                {
+                    "configurable": {
+                        "thread_id": "ursa",
+                    },
+                    "recursion_limit": 50,
+                },
+            )
+        )
+        return result
+
     run(query_1)
 
     for result in results:

From a8f4594f12614adc0fc944a8def06ac8034201e3 Mon Sep 17 00:00:00 2001
From: Arthur Lui <alui@lanl.gov>
Date: Mon, 22 Dec 2025 17:27:40 -0700
Subject: [PATCH 24/36] dynamic llm in multiagent test

---
 tests/agents/test_multiagent/test_multiagent.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/agents/test_multiagent/test_multiagent.py b/tests/agents/test_multiagent/test_multiagent.py
index c86348ba..62bf08d4 100644
--- a/tests/agents/test_multiagent/test_multiagent.py
+++ b/tests/agents/test_multiagent/test_multiagent.py
@@ -25,7 +25,7 @@
     )
 else:
     # Use openai
-    llm = init_chat_model("openai:gpt-5.2")
+    llm = init_chat_model(os.getenv("URSA_TEST_LLM", "openai:gpt-5.2"))
 
 
 def generate_data(data_path: Path):

From d37333db898ff0112f6fafbf972669d9e5769e23 Mon Sep 17 00:00:00 2001
From: Mike Grosskopf <mikegros@lanl.gov>
Date: Wed, 24 Dec 2025 21:19:49 -0700
Subject: [PATCH 25/36] Update test_multiagent.py

---
 tests/agents/test_multiagent/test_multiagent.py | 17 +++--------------
 1 file changed, 3 insertions(+), 14 deletions(-)

diff --git a/tests/agents/test_multiagent/test_multiagent.py b/tests/agents/test_multiagent/test_multiagent.py
index 62bf08d4..3bb75a70 100644
--- a/tests/agents/test_multiagent/test_multiagent.py
+++ b/tests/agents/test_multiagent/test_multiagent.py
@@ -12,20 +12,9 @@
 
 from ursa.experimental.agents.multiagent import Ursa
 
-use_aiportal = False
-
-if use_aiportal:
-    llm = init_chat_model(
-        model=os.environ["CLAUDE"],
-        base_url=os.environ["AIPORTAL_API_URL"],
-        api_key=os.environ["AIPORTAL_API_KEY"],
-        model_provider="openai",
-        model_kwargs={"extra_headers": {"disable_fallbacks": "true"}},
-        http_client=httpx.Client(verify=False),
-    )
-else:
-    # Use openai
-    llm = init_chat_model(os.getenv("URSA_TEST_LLM", "openai:gpt-5.2"))
+
+# Use openai for the test on github
+llm = init_chat_model(os.getenv("URSA_TEST_LLM", "openai:gpt-5.2"))
 
 
 def generate_data(data_path: Path):

From ad8b21f6a915f693a3c94d5a694d24057ebc524f Mon Sep 17 00:00:00 2001
From: Mike Grosskopf <mikegros@lanl.gov>
Date: Wed, 24 Dec 2025 21:21:31 -0700
Subject: [PATCH 26/36] Update test_multiagent.py

---
 tests/agents/test_multiagent/test_multiagent.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/agents/test_multiagent/test_multiagent.py b/tests/agents/test_multiagent/test_multiagent.py
index 3bb75a70..7b35dcbd 100644
--- a/tests/agents/test_multiagent/test_multiagent.py
+++ b/tests/agents/test_multiagent/test_multiagent.py
@@ -5,7 +5,6 @@
 import os
 from pathlib import Path
 
-import httpx
 from langchain.chat_models import init_chat_model
 from langchain.messages import HumanMessage
 from langgraph.checkpoint.memory import InMemorySaver

From ffc2ebb96ac240fd13ddf8f5c805b6186cec5efe Mon Sep 17 00:00:00 2001
From: Mike Grosskopf <mike.grosskopf@gmail.com>
Date: Wed, 24 Dec 2025 21:23:28 -0700
Subject: [PATCH 27/36] Small formatting update.

---
 tests/agents/test_multiagent/test_multiagent.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/agents/test_multiagent/test_multiagent.py b/tests/agents/test_multiagent/test_multiagent.py
index 7b35dcbd..f4f7efa3 100644
--- a/tests/agents/test_multiagent/test_multiagent.py
+++ b/tests/agents/test_multiagent/test_multiagent.py
@@ -1,7 +1,6 @@
 # NOTE: This will be helpful for prompting.
 # https://cookbook.openai.com/examples/gpt-5/gpt-5_prompting_guide
 
-
 import os
 from pathlib import Path
 

From f30b7e5811b5a54349349ec8e6ed586c5341b418 Mon Sep 17 00:00:00 2001
From: Mike Grosskopf <mike.grosskopf@gmail.com>
Date: Wed, 24 Dec 2025 21:25:53 -0700
Subject: [PATCH 28/36] Small formatting update.

---
 tests/agents/test_multiagent/test_multiagent.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/agents/test_multiagent/test_multiagent.py b/tests/agents/test_multiagent/test_multiagent.py
index f4f7efa3..d9ed9c43 100644
--- a/tests/agents/test_multiagent/test_multiagent.py
+++ b/tests/agents/test_multiagent/test_multiagent.py
@@ -1,5 +1,5 @@
 # NOTE: This will be helpful for prompting.
-# https://cookbook.openai.com/examples/gpt-5/gpt-5_prompting_guide
+#     https://cookbook.openai.com/examples/gpt-5/gpt-5_prompting_guide
 
 import os
 from pathlib import Path

From 49dc312ea378a8cdd9485c96ca4b2c054c560247 Mon Sep 17 00:00:00 2001
From: Mike Grosskopf <mikegros@lanl.gov>
Date: Thu, 25 Dec 2025 12:18:12 -0700
Subject: [PATCH 29/36] Formatting

---
 tests/agents/test_multiagent/test_multiagent.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/agents/test_multiagent/test_multiagent.py b/tests/agents/test_multiagent/test_multiagent.py
index d9ed9c43..acbfeb3f 100644
--- a/tests/agents/test_multiagent/test_multiagent.py
+++ b/tests/agents/test_multiagent/test_multiagent.py
@@ -10,7 +10,6 @@
 
 from ursa.experimental.agents.multiagent import Ursa
 
-
 # Use openai for the test on github
 llm = init_chat_model(os.getenv("URSA_TEST_LLM", "openai:gpt-5.2"))
 

From d00f2ba09147795d28c90b037f84e8bdc91410d2 Mon Sep 17 00:00:00 2001
From: Arthur Lui <alui@lanl.gov>
Date: Mon, 5 Jan 2026 16:42:28 -0700
Subject: [PATCH 30/36] default extra_tools to None

---
 src/ursa/experimental/agents/multiagent.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/ursa/experimental/agents/multiagent.py b/src/ursa/experimental/agents/multiagent.py
index 25a34326..d89e7907 100644
--- a/src/ursa/experimental/agents/multiagent.py
+++ b/src/ursa/experimental/agents/multiagent.py
@@ -145,7 +145,7 @@ class Ursa:
     def __init__(
         self,
         llm: BaseChatModel,
-        extra_tools: list = [],
+        extra_tools: Optional[list] = None,
         workspace: Path = Path("ursa-workspace"),
         checkpointer: Optional[BaseCheckpointSaver] = None,
         thread_id: str = "ursa",
@@ -153,7 +153,7 @@ def __init__(
         system_prompt: str = system_prompt,
     ):
         self.llm = llm
-        self.extra_tools = extra_tools
+        self.extra_tools = extra_tools or []
         self.workspace = workspace
         self.checkpointer = checkpointer
         self.thread_id = thread_id

From d0b4f7e7864c34706326155a121d6f2d754f6568 Mon Sep 17 00:00:00 2001
From: Arthur Lui <alui@lanl.gov>
Date: Mon, 5 Jan 2026 16:46:19 -0700
Subject: [PATCH 31/36] change default workspace

---
 .gitignore                                 | 1 -
 src/ursa/experimental/agents/multiagent.py | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/.gitignore b/.gitignore
index c43b5c40..a7389580 100644
--- a/.gitignore
+++ b/.gitignore
@@ -32,4 +32,3 @@ arxiv_papers
 ursa_workspace/
 .vscode/settings.json
 scratch/
-ursa-workspace/
diff --git a/src/ursa/experimental/agents/multiagent.py b/src/ursa/experimental/agents/multiagent.py
index d89e7907..e2f8eada 100644
--- a/src/ursa/experimental/agents/multiagent.py
+++ b/src/ursa/experimental/agents/multiagent.py
@@ -146,7 +146,7 @@ def __init__(
         self,
         llm: BaseChatModel,
         extra_tools: Optional[list] = None,
-        workspace: Path = Path("ursa-workspace"),
+        workspace: Path = Path("ursa_workspace"),
         checkpointer: Optional[BaseCheckpointSaver] = None,
         thread_id: str = "ursa",
         max_reflection_steps: int = 1,

From 8c15502a8b719b5c51003fa68f9ceafdfe7693ab Mon Sep 17 00:00:00 2001
From: Arthur Lui <alui@lanl.gov>
Date: Mon, 5 Jan 2026 16:46:49 -0700
Subject: [PATCH 32/36] add space

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index a7389580..3093dabc 100644
--- a/.gitignore
+++ b/.gitignore
@@ -32,3 +32,4 @@ arxiv_papers
 ursa_workspace/
 .vscode/settings.json
 scratch/
+

From 8fbcc44fcd8e5fc54a7dc691031bd6907a06d708 Mon Sep 17 00:00:00 2001
From: Mike Grosskopf <mikegros@lanl.gov>
Date: Fri, 9 Jan 2026 10:22:35 -0700
Subject: [PATCH 33/36] Fix to address failed test

Alex's recent refactor removed the `_action` method and made `invoke` a method of each agent directly.
Dropped the `._action` accesses on ExecutionAgent here, which should let the remaining CI tests pass.
Co-authored-by: lui-arthur
---
 src/ursa/experimental/agents/multiagent.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/ursa/experimental/agents/multiagent.py b/src/ursa/experimental/agents/multiagent.py
index e2f8eada..1c24368e 100644
--- a/src/ursa/experimental/agents/multiagent.py
+++ b/src/ursa/experimental/agents/multiagent.py
@@ -46,7 +46,7 @@ def tag(tag_name: str, content: str):
 # NOTE: Resources
 # https://docs.langchain.com/oss/python/langchain/multi-agent#where-to-customize
 def make_execute_plan_tool(llm: BaseChatModel, workspace: Path):
-    execution_agent = ExecutionAgent(llm)._action
+    execution_agent = ExecutionAgent(llm)
 
     @tool(
         "execute_plan_tool",
@@ -125,7 +125,7 @@ def call_agent(query: str):
 
 
 def make_execution_tool(llm: BaseChatModel, workspace: Path):
-    execution_agent = ExecutionAgent(llm)._action
+    execution_agent = ExecutionAgent(llm)
 
     @tool(
         "execution_agent",

From c8de7b24705feb11454b98338b84a682f41a5f01 Mon Sep 17 00:00:00 2001
From: Mike Grosskopf <mikegros@lanl.gov>
Date: Fri, 9 Jan 2026 10:51:44 -0700
Subject: [PATCH 34/36] Missed one _action

There was one I missed on the planning agent.
Co-authored-by: lui-arthur
---
 src/ursa/experimental/agents/multiagent.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ursa/experimental/agents/multiagent.py b/src/ursa/experimental/agents/multiagent.py
index 1c24368e..7f41a3c6 100644
--- a/src/ursa/experimental/agents/multiagent.py
+++ b/src/ursa/experimental/agents/multiagent.py
@@ -103,7 +103,7 @@ def execute_plan(plan: str):
 def make_planning_tool(llm: BaseChatModel, max_reflection_steps: int):
     planning_agent = PlanningAgent(
         llm, max_reflection_steps=max_reflection_steps
-    )._action
+    )
 
     @tool(
         "planning_agent",

From 52b6854bdfe966f11af280b354f16becfb79aa9f Mon Sep 17 00:00:00 2001
From: Mike Grosskopf <mike.grosskopf@gmail.com>
Date: Wed, 21 Jan 2026 00:07:42 -0700
Subject: [PATCH 35/36] Small update toward bringing up to date with other PRs.
 I will do a little more updating soon.

---
 src/ursa/experimental/agents/multiagent.py | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/src/ursa/experimental/agents/multiagent.py b/src/ursa/experimental/agents/multiagent.py
index 7f41a3c6..014905f1 100644
--- a/src/ursa/experimental/agents/multiagent.py
+++ b/src/ursa/experimental/agents/multiagent.py
@@ -114,10 +114,18 @@ def call_agent(query: str):
             "messages": [HumanMessage(query)],
             "reflection_steps": max_reflection_steps,
         })
-        # return result["messages"][-1].content
-        plan_steps = [{"task": query}] + result["plan_steps"]
+        plan_steps = [{"task": query}] + [
+            {
+                "name": plan_step.name,
+                "description": plan_step.description,
+                "expected_outputs": plan_step.expected_outputs,
+                "success_criteria": plan_step.success_criteria,
+                "requires_code": plan_step.requires_code,
+            }
+            for plan_step in result["plan"].steps
+        ]
+
         plan = f"<PLAN>\n{json.dumps(plan_steps)}\n</PLAN>"
-        # print(json.dumps(plan_steps, indent=4))
         print(yaml.dump(plan_steps))
         return plan
 

From d1f393ee8720043eb872e0d594013dd6b8cf4f1e Mon Sep 17 00:00:00 2001
From: Mike Grosskopf <mikegros@lanl.gov>
Date: Wed, 21 Jan 2026 10:54:35 -0700
Subject: [PATCH 36/36] Small updates

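- Pass the checkpointer (and a per-subagent thread_id) to the subagents so that they can keep a history.
- Fix a JSONDecodeError that could occur when parsing the plan from the LLM output.
    - Strip invalid control characters (everything except \t, \n, \r) from the plan string before passing it to json.loads.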
---
 src/ursa/experimental/agents/multiagent.py | 84 ++++++++++++++++------
 1 file changed, 64 insertions(+), 20 deletions(-)

diff --git a/src/ursa/experimental/agents/multiagent.py b/src/ursa/experimental/agents/multiagent.py
index 014905f1..dd0b9da0 100644
--- a/src/ursa/experimental/agents/multiagent.py
+++ b/src/ursa/experimental/agents/multiagent.py
@@ -1,4 +1,5 @@
 import json
+import re
 from pathlib import Path
 from typing import Optional
 
@@ -45,8 +46,18 @@ def tag(tag_name: str, content: str):
 
 # NOTE: Resources
 # https://docs.langchain.com/oss/python/langchain/multi-agent#where-to-customize
-def make_execute_plan_tool(llm: BaseChatModel, workspace: Path):
-    execution_agent = ExecutionAgent(llm)
+def make_execute_plan_tool(
+    llm: BaseChatModel,
+    workspace: Path,
+    thread_id: str,
+    checkpointer: Checkpointer,
+):
+    execution_agent = ExecutionAgent(
+        llm,
+        workspace=workspace,
+        checkpointer=checkpointer,
+        thread_id=thread_id + "_plan_executor",
+    )
 
     @tool(
         "execute_plan_tool",
@@ -59,9 +70,17 @@ def execute_plan(plan: str):
         if plan.startswith("<PLAN>") and plan.endswith("</PLAN>"):
             summaries = []
 
-            task_and_plan_steps = json.loads(
+            plan_string = (
                 plan.replace("<PLAN>", "").replace("</PLAN>", "").strip()
             )
+            # Slight format cleaning.
+            #     Remove control characters except \t, \n, \r
+            #     Some LLMs respond with invalid control characters
+            plan_string = re.sub(
+                r"[\x00-\x08\x0b-\x0c\x0e-\x1f\x7f]", "", plan_string
+            )
+            task_and_plan_steps = json.loads(plan_string)
+
             task = task_and_plan_steps[0]["task"]
             plan_steps = task_and_plan_steps[1:]
             for step in plan_steps:
@@ -82,11 +101,8 @@ def execute_plan(plan: str):
                 step_prompt += tag("NEXT_STEP", yaml.dump(step).strip())
                 print(step_prompt)
 
-                result = execution_agent.invoke({
-                    "messages": [HumanMessage(step_prompt)],
-                    "workspace": str(workspace),
-                })
-                last_step_summary = result["messages"][-1].content
+                result = execution_agent.invoke(step_prompt)
+                last_step_summary = result["messages"][-1].text
                 summaries.append(last_step_summary)
             return "Grand summary of plan execution:\n\n" + "\n\n".join(
                 summaries
@@ -100,9 +116,17 @@ def execute_plan(plan: str):
     return execute_plan
 
 
-def make_planning_tool(llm: BaseChatModel, max_reflection_steps: int):
+def make_planning_tool(
+    llm: BaseChatModel,
+    max_reflection_steps: int,
+    thread_id: str,
+    checkpointer: Checkpointer,
+):
     planning_agent = PlanningAgent(
-        llm, max_reflection_steps=max_reflection_steps
+        llm,
+        checkpointer=checkpointer,
+        thread_id=thread_id + "_planner",
+        max_reflection_steps=max_reflection_steps,
     )
 
     @tool(
@@ -132,19 +156,26 @@ def call_agent(query: str):
     return call_agent
 
 
-def make_execution_tool(llm: BaseChatModel, workspace: Path):
-    execution_agent = ExecutionAgent(llm)
+def make_execution_tool(
+    llm: BaseChatModel,
+    workspace: Path,
+    thread_id: str,
+    checkpointer: Checkpointer,
+):
+    execution_agent = ExecutionAgent(
+        llm,
+        workspace=workspace,
+        checkpointer=checkpointer,
+        thread_id=thread_id + "_executor",
+    )
 
     @tool(
         "execution_agent",
         description="Read and edit scripts/code, and execute arbitrary commands on command line.",
     )
     def call_agent(query: str):
-        result = execution_agent.invoke({
-            "messages": [HumanMessage(query)],
-            "workspace": str(workspace),
-        })
-        return result["messages"][-1].content
+        result = execution_agent.invoke(query)
+        return result["messages"][-1].text
 
     return call_agent
 
@@ -177,11 +208,24 @@ def create(self, **kwargs):
         kwargs: for `create_agent`
         """
         self.subagents = [
-            make_execution_tool(llm=self.llm, workspace=self.workspace),
+            make_execution_tool(
+                llm=self.llm,
+                workspace=self.workspace,
+                thread_id=self.thread_id,
+                checkpointer=self.checkpointer,
+            ),
             make_planning_tool(
-                llm=self.llm, max_reflection_steps=self.max_reflection_steps
+                llm=self.llm,
+                max_reflection_steps=self.max_reflection_steps,
+                thread_id=self.thread_id,
+                checkpointer=self.checkpointer,
+            ),
+            make_execute_plan_tool(
+                llm=self.llm,
+                workspace=self.workspace,
+                thread_id=self.thread_id,
+                checkpointer=self.checkpointer,
             ),
-            make_execute_plan_tool(llm=self.llm, workspace=self.workspace),
         ]
         self.tools = self.subagents + self.extra_tools
         return create_agent(