lanl · mikegros · Feb 2, 2026 · Dec 4, 2025 · Dec 5, 2025 · Dec 6, 2025
diff --git a/justfile b/justfile
@@ -100,3 +100,7 @@ shell:
 pygrep pattern:
     conda run --live-stream -n base watch \
         grep --exclude-dir=__pycache__ --exclude-dir=.venv -r '{{ pattern }}'
+
+[no-cd]
+python:
+    uv run ipython --no-autoindent
diff --git a/src/ursa/agents/execution_agent.py b/src/ursa/agents/execution_agent.py
@@ -79,7 +79,7 @@ class ExecutionState(TypedDict):
     """TypedDict representing the execution agent's mutable run state used by nodes.
 
     Fields:
-    - messages: list of messages (System/Human/AI/Tool) with add_messages metadata.
+    - messages: list of messages (System/Human/AI/Tool).
     - current_progress: short status string describing agent progress.
     - code_files: list of filenames created or edited in the workspace.
     - workspace: path to the working directory where files and commands run.

diff --git a/src/ursa/experimental/agents/multiagent.py b/src/ursa/experimental/agents/multiagent.py
@@ -0,0 +1,237 @@
+import json
+import re
+from pathlib import Path
+from typing import Optional
+
+import yaml
+from langchain.agents import create_agent
+from langchain.chat_models import BaseChatModel
+from langchain.messages import HumanMessage
+from langchain.tools import tool
+from langgraph.checkpoint.base import BaseCheckpointSaver
+
+from ursa.agents import ExecutionAgent, PlanningAgent
+from ursa.util import Checkpointer
+
+system_prompt = """\
+You are a data scientist with multiple tools.
+
+These tools are available to you:
+
+* planning_agent
+  * Use this tool whenever you are asked to plan out tasks.
+  * In each step of your plan, if code needs to be generated, please explicitly
+    state in the step that code needs to be written and executed.
+
+* execution_agent
+  * Use this tool **whenever** you are asked to write/edit code or run
+    arbitrary commands from the command line.
+
+* execute_plan_tool
+  * Use this tool if you are asked to execute a plan that starts with <PLAN>
+    and ends with </PLAN>.
+  * Do not use this tool if the <PLAN></PLAN> tags are not present in the
+    instruction!
+
+Note that this project is managed by `uv. So, if you need to execute python
+code, you MUST run `uv run path/to/file.py`. DO NOT run `python
+/path/to/file.py` or `python3 /path/to/file.py`.
+"""
+
+
+def tag(tag_name: str, content: str):
+    """Wrap content in XML tag"""
+    return f"\n<{tag_name}>\n{content}\n</{tag_name}>\n\n"
+
+
+# NOTE: Resources
+# https://docs.langchain.com/oss/python/langchain/multi-agent#where-to-customize
+def make_execute_plan_tool(
+    llm: BaseChatModel,
+    workspace: Path,
+    thread_id: str,
+    checkpointer: Checkpointer,
+):
+    execution_agent = ExecutionAgent(
+        llm,
+        workspace=workspace,
+        checkpointer=checkpointer,
+        thread_id=thread_id + "_plan_executor",
+    )
+
+    @tool(
+        "execute_plan_tool",
+        description="Execute a plan from the planning agent tool.",
+    )
+    def execute_plan(plan: str):
+        """Execute plan item by item."""
+
+        print("EXECUTING PLAN")
+        if plan.startswith("<PLAN>") and plan.endswith("</PLAN>"):
+            summaries = []
+
+            plan_string = (
+                plan.replace("<PLAN>", "").replace("</PLAN>", "").strip()
+            )
+            # Slight format cleaning.
+            #     Remove control characters except \t, \n, \r
+            #     Some LLMs respond with invalid control characters
+            plan_string = re.sub(
+                r"[\x00-\x08\x0b-\x0c\x0e-\x1f\x7f]", "", plan_string
+            )
+            task_and_plan_steps = json.loads(plan_string)
+
+            task = task_and_plan_steps[0]["task"]
+            plan_steps = task_and_plan_steps[1:]
+            for step in plan_steps:
+                step_prompt = (
+                    "You are contributing a solution of an overall plan. "
+                    "The overall plan, last step's summary, and next step are provided below. "
+                    "With the provided information, please carry out the next step. "
+                    "IF you write any code, be sure to execute the code to make "
+                    "sure it properly runs."
+                )
+                step_prompt += tag("OVERALL_PLAN", task)
+                if len(summaries) > 0:
+                    last_step_summary = summaries[-1]
+                    step_prompt += tag(
+                        "SUMMARY_OF_LAST_STEP", last_step_summary
+                    )
+
+                step_prompt += tag("NEXT_STEP", yaml.dump(step).strip())
+                print(step_prompt)
+
+                result = execution_agent.invoke(step_prompt)
+                last_step_summary = result["messages"][-1].text
+                summaries.append(last_step_summary)
+            return "Grand summary of plan execution:\n\n" + "\n\n".join(
+                summaries
+            )
+        else:
+            return (
+                "Could not use `execute_plan` tool execute plan "
+                "as plan does not start/end with <PLAN>/</PLAN>."
+            )
+
+    return execute_plan
+
+
+def make_planning_tool(
+    llm: BaseChatModel,
+    max_reflection_steps: int,
+    thread_id: str,
+    checkpointer: Checkpointer,
+):
+    planning_agent = PlanningAgent(
+        llm,
+        checkpointer=checkpointer,
+        thread_id=thread_id + "_planner",
+        max_reflection_steps=max_reflection_steps,
+    )
+
+    @tool(
+        "planning_agent",
+        description="Create plans for arbitrary tasks",
+    )
+    def call_agent(query: str):
+        result = planning_agent.invoke({
+            "messages": [HumanMessage(query)],
+            "reflection_steps": max_reflection_steps,
+        })
+        plan_steps = [{"task": query}] + [
+            {
+                "name": plan_step.name,
+                "description": plan_step.description,
+                "expected_outputs": plan_step.expected_outputs,
+                "success_criteria": plan_step.success_criteria,
+                "requires_code": plan_step.requires_code,
+            }
+            for plan_step in result["plan"].steps
+        ]
+
+        plan = f"<PLAN>\n{json.dumps(plan_steps)}\n</PLAN>"
+        print(yaml.dump(plan_steps))
+        return plan
+
+    return call_agent
+
+
+def make_execution_tool(
+    llm: BaseChatModel,
+    workspace: Path,
+    thread_id: str,
+    checkpointer: Checkpointer,
+):
+    execution_agent = ExecutionAgent(
+        llm,
+        workspace=workspace,
+        checkpointer=checkpointer,
+        thread_id=thread_id + "_executor",
+    )
+
+    @tool(
+        "execution_agent",
+        description="Read and edit scripts/code, and execute arbitrary commands on command line.",
+    )
+    def call_agent(query: str):
+        result = execution_agent.invoke(query)
+        return result["messages"][-1].text
+
+    return call_agent
+
+
+class Ursa:
+    def __init__(
+        self,
+        llm: BaseChatModel,
+        extra_tools: Optional[list] = None,
+        workspace: Path = Path("ursa_workspace"),
+        checkpointer: Optional[BaseCheckpointSaver] = None,
+        thread_id: str = "ursa",
+        max_reflection_steps: int = 1,
+        system_prompt: str = system_prompt,
+    ):
+        self.llm = llm
+        self.extra_tools = extra_tools or []
+        self.workspace = workspace
+        self.checkpointer = checkpointer
+        self.thread_id = thread_id
+        self.system_prompt = system_prompt
+        self.max_reflection_steps = max_reflection_steps
+        self.checkpointer = checkpointer or Checkpointer.from_workspace(
+            workspace
+        )
+
+    def create(self, **kwargs):
+        """Create agent.
+
+        kwargs: for `create_agent`
+        """
+        self.subagents = [
+            make_execution_tool(
+                llm=self.llm,
+                workspace=self.workspace,
+                thread_id=self.thread_id,
+                checkpointer=self.checkpointer,
+            ),
+            make_planning_tool(
+                llm=self.llm,
+                max_reflection_steps=self.max_reflection_steps,
+                thread_id=self.thread_id,
+                checkpointer=self.checkpointer,
+            ),
+            make_execute_plan_tool(
+                llm=self.llm,
+                workspace=self.workspace,
+                thread_id=self.thread_id,
+                checkpointer=self.checkpointer,
+            ),
+        ]
+        self.tools = self.subagents + self.extra_tools
+        return create_agent(
+            self.llm,
+            tools=self.tools,
+            system_prompt=self.system_prompt,
+            checkpointer=self.checkpointer,
+            **kwargs,
+        )
diff --git a/tests/agents/test_multiagent/.gitignore b/tests/agents/test_multiagent/.gitignore
@@ -0,0 +1 @@
+workspace/
diff --git a/tests/agents/test_multiagent/test_multiagent.py b/tests/agents/test_multiagent/test_multiagent.py
@@ -0,0 +1,101 @@
+# NOTE: This will be helpful for prompting.
+#     https://cookbook.openai.com/examples/gpt-5/gpt-5_prompting_guide
+
+import os
+from pathlib import Path
+
+from langchain.chat_models import init_chat_model
+from langchain.messages import HumanMessage
+from langgraph.checkpoint.memory import InMemorySaver
+
+from ursa.experimental.agents.multiagent import Ursa
+
+# Use openai for the test on github
+llm = init_chat_model(os.getenv("URSA_TEST_LLM", "openai:gpt-5.2"))
+
+
+def generate_data(data_path: Path):
+    import numpy as np
+    import pandas as pd
+
+    rng = np.random.default_rng(0)
+    x = rng.uniform(0, 1, 100)
+    y = rng.normal(2 * x + 1, 0.1)
+    pd.DataFrame(dict(x=x, y=y)).to_csv(data_path, index=False)
+
+
+# TODO: Need to make `uv run` a SAFE command.
+query_1 = """
+I have a file `data/data.csv`. 
+
+**First**, read the first few lines of the file to understand the format.
+Do this quickly; don't go overboard.
+
+**Then**, write a plan (with at most 4 steps) to perform simple linear
+regression on this data in python.  The plan MUST NOT include code; though it
+may include instruction to write code. The analysis should be **very minimal**
+and AS CONCISE AS POSSIBLE.  I care only about the coefficients (including an
+intercept). Do not provide other information or plots.
+
+**Then**, EXECUTE THE PLAN using execute_plan_tool. Write all code to
+`analysis.py`. DO NOT write anything to `data/`. Do not write any other
+files. I want a single file with the entire analysis.
+
+**Finally**, edit `analysis.py` to make it AS CONCISE AS POSSIBLE. Don't
+include code for assert, raising errors, exception handling, plots, etc. I want
+ONLY a very minimal script that reads the data and then prints the linear
+model's coefficients. Remember, I want A SINGLE FILE with the entire analysis
+(in `analysis.py`).
+"""
+
+# An alternate query to test.
+query_2 = """
+I have a file `data/data.csv`. 
+
+Please write a very minimal python script to perform linear regression on this
+data.  The analysis shoud be as concise as possible. I care only about the
+coefficients (including an intercept).  Do not provide other information or
+plots. Write the analysis to `analysis.py`. Run the code to ensure it works. 
+"""
+
+
+def test_multiagent():
+    # Generate data if not already present.
+    workspace = Path(__file__).parent / "workspace"
+    data_dir = workspace / "data"
+    data_csv = data_dir / "data.csv"
+    if not data_csv.exists():
+        data_dir.mkdir(exist_ok=True, parents=True)
+        generate_data(data_dir / "data.csv")
+
+    # Initialize agent.
+    agent = Ursa(
+        llm,
+        max_reflection_steps=0,
+        workspace=workspace,
+        checkpointer=InMemorySaver(),
+    ).create()
+
+    # Store results (AI output) in this list.
+    results = []
+
+    def run(query: str):
+        print(f"Task:\n{query}")
+        results.append(
+            result := agent.invoke(
+                {"messages": [HumanMessage(query)]},
+                {
+                    "configurable": {
+                        "thread_id": "ursa",
+                    },
+                    "recursion_limit": 50,
+                },
+            )
+        )
+        return result
+
+    run(query_1)
+
+    for result in results:
+        for msg in result["messages"]:
+            msg.pretty_print()