Skip to content

[WIP] MCP code executor to execute LLM-generated code flexibly in CodeAct and ProgramOfThought #8467

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 2 commits into
base: main
Choose a base branch
from

Conversation

chenmoneygithub
Copy link
Collaborator

@chenmoneygithub chenmoneygithub commented Jun 26, 2025

Very dirty code now, please don't look at it😶‍🌫️

Both async and sync are supported, but to use optimizers, users need to use the sync path:

import dspy

# Language models for the experiment; only lm_4o is installed as the default.
# NOTE(review): lm_4o_mini and llama are defined but never used below —
# presumably kept around for quick swapping while experimenting; confirm.
lm_4o_mini = dspy.LM("openai/gpt-4o-mini")
lm_4o = dspy.LM("openai/gpt-4o")
llama = dspy.LM("databricks/databricks-meta-llama-3-3-70b-instruct")
dspy.configure(
    lm=lm_4o,
    # adapter=dspy.JSONAdapter(),
)


from pydantic import BaseModel

from dspy.datasets import DataLoader

# Load the HoVer claim-verification dataset from Hugging Face, keeping only
# the fields needed to build examples; "claim" is the sole input key.
kwargs = dict(fields=("claim", "supporting_facts", "hpqa_id", "num_hops"), input_keys=("claim",))
hover = DataLoader().from_huggingface(dataset_name="hover-nlp/hover", split="train", trust_remote_code=True, **kwargs)

# Keep only 3-hop claims, de-duplicated by HotpotQA id, and convert each row
# into a dspy.Example whose input is the claim and whose label is the list of
# gold supporting-page titles.
seen_hpqa_ids = set()
deduped = []
for row in hover:
    if row["num_hops"] != 3 or row["hpqa_id"] in seen_hpqa_ids:
        continue
    seen_hpqa_ids.add(row["hpqa_id"])
    gold_titles = list({fact["key"] for fact in row.supporting_facts})
    deduped.append(dspy.Example(claim=row.claim, titles=gold_titles).with_inputs("claim"))
hover = deduped

# Train/dev/test split over the filtered examples.
trainset, devset, testset = hover[:100], hover[100:150], hover[650:]

# Show one example to sanity-check the data.
example = trainset[0]
print("Claim:", example.claim)
print("Pages that must be retrieved:", example.titles)

# Module-level cache mapping Wikipedia page title -> page text, filled by
# search() and read by lookup_wikipedia().
DOCS = {}


class SearchInput(BaseModel):
    """Pydantic schema for a search query. NOTE(review): not referenced anywhere else in this script — confirm it is still needed."""
    query: str


def search(query: str, k: int) -> list[str]:
    """Retrieve the top-k ColBERTv2 passages for `query` and cache them by title.

    Each passage is expected to be formatted as "<title> | <text>"; the pair is
    stored in the module-level DOCS cache so lookup_wikipedia can serve it later.

    Args:
        query: Free-text search query.
        k: Number of passages to retrieve.

    Returns:
        The raw list of passage strings (including any malformed ones).
    """
    results = dspy.ColBERTv2(url="http://20.102.90.50:2017/wiki17_abstracts")(query, k=k)
    results = [x["text"] for x in results]

    for result in results:
        # Guard against passages missing the "title | text" separator; the
        # previous two-target unpack raised ValueError on such rows.
        title, sep, text = result.partition(" | ")
        if sep:
            DOCS[title] = text

    return results


def search_wikipedia(query: str) -> list[str]:
    """Returns top-5 results and then the titles of the top-5 to top-30 results."""
    results = search(query, 30)
    top5 = results[:5]
    # Titles of the remaining results, wrapped in backticks for the summary line.
    other_titles = [f"`{r.split(' | ')[0]}`" for r in results[5:30]]
    summary = f"Other retrieved pages have titles: {', '.join(other_titles)}."
    return top5 + [summary]


def lookup_wikipedia(title: str) -> str:
    """Returns the text of the Wikipedia page, if it exists."""
    # Serve from the cache populated by earlier search() calls when possible.
    cached = DOCS.get(title)
    if cached is not None:
        return cached

    # Fall back to searching by title and keeping the first exact-title match.
    prefix = title + " | "
    match = next((page for page in search(title, 10) if page.startswith(prefix)), None)
    if match is None:
        return f"No Wikipedia page found for title: {title}"
    return match


# Build the CodeAct agent: given a claim, it should produce the list of
# Wikipedia titles needed to verify (or refute) it, using the two tools above.
instructions = "Find all Wikipedia titles relevant to verifying (or refuting) the claim."
signature = dspy.Signature("claim -> titles: list[str]", instructions)

tools = [dspy.Tool(search_wikipedia), dspy.Tool(lookup_wikipedia)]

codeact = dspy.CodeAct(signature, tools=tools, max_iters=5)

# Smoke-test the agent on a single claim and inspect the last few LM calls.
output = codeact(claim="David Gregory was born in 1625.")
print(output)

dspy.inspect_history(n=3)


def top5_recall(example, pred, trace=None):
    """Fraction of the gold titles found among the first 5 predicted titles.

    Args:
        example: Gold example; must expose a `titles` list.
        pred: Prediction; must expose a `titles` list (only the first 5 count).
        trace: When not None (i.e. during bootstrapping), the metric becomes a
            strict pass/fail signal instead of a score.

    Returns:
        A float recall in [0, 1] during plain evaluation, or a bool during
        bootstrapping (True only for perfect recall).
    """
    gold_titles = example.titles

    # An example with no gold titles is vacuously satisfied; this also avoids
    # the ZeroDivisionError the unguarded division below would raise.
    if not gold_titles:
        return True if trace is not None else 1.0

    top5 = pred.titles[:5]
    recall = sum(title in top5 for title in gold_titles) / len(gold_titles)

    # If we're "bootstrapping" for optimization, return True if and only if the recall is perfect.
    if trace is not None:
        return recall >= 1.0

    # If we're just doing inference, just measure the recall.
    return recall


# Evaluator over the dev set: top-5 title recall, 16 threads, progress bar,
# and a 5-row result table. NOTE(review): devset holds only 50 examples, so
# the [:100] slice is a no-op — confirm the intended dev-set size.
evaluate = dspy.Evaluate(
    devset=devset[:100], metric=top5_recall, num_threads=16, display_progress=True, display_table=5
)


def safe_codeact(claim: str):
    """Run the CodeAct agent on a claim; on any error, log it and return an empty prediction."""
    try:
        result = codeact(claim=claim)
    except Exception as err:
        # Best-effort wrapper for evaluation: surface the error, don't crash.
        print(err)
        return dspy.Prediction(titles=[])
    return result


# Baseline evaluation of the uncompiled CodeAct program on the dev set.
evaluate(safe_codeact)


# Set up a basic teleprompter, which will compile our RAG program.
# BootstrapFewShot gathers up to 4 bootstrapped and 4 labeled demos per
# predictor over at most 3 rounds; top5_recall (called with trace set) decides
# which bootstrapped traces count as successes.
teleprompter = dspy.BootstrapFewShot(
    metric=top5_recall,
    max_bootstrapped_demos=4,
    max_labeled_demos=4,
    max_rounds=3,
)

# Compile!
# Only the first 20 training examples are used, keeping compilation cheap.
compiled_codeact = teleprompter.compile(codeact, trainset=trainset[:20])


def safe_compiled_codeact(claim: str):
    """Run the compiled CodeAct program; on any error, log it and return an empty prediction.

    Mirrors safe_codeact's error handling: the exception is printed rather than
    silently swallowed, so failures during evaluation stay visible.
    """
    try:
        return compiled_codeact(claim=claim)
    except Exception as e:
        # Previously this swallowed the error silently, unlike safe_codeact;
        # log it for parity, then fall back to an empty title list.
        print(e)
        return dspy.Prediction(titles=[])


# Evaluate the compiled program on the dev set, then persist its state
# (demos/config only — save_program=False skips the full program pickle).
evaluate(safe_compiled_codeact)

compiled_codeact.save("codeact_native_tmp.json", save_program=False)

Performance of CodeAct on the HoVer dataset is not very good, though:

| Model | ReAct + ChatAdapter | CodeAct + ChatAdapter |
|---|---|---|
| gpt-4o | 76.7% | 53.3% |
| gpt-4.1-mini | 81.3% | 50.7% |

@chenmoneygithub chenmoneygithub marked this pull request as draft June 26, 2025 23:29
@arthurcolle
Copy link

I would like to recommend using a newer cheaper model

gpt-4.1-nano is better for examples

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

Successfully merging this pull request may close these issues.

2 participants