diff --git a/nemo_skills/inference/eval/swebench.py b/nemo_skills/inference/eval/swebench.py index 0fc7509ccc..f3610091e8 100644 --- a/nemo_skills/inference/eval/swebench.py +++ b/nemo_skills/inference/eval/swebench.py @@ -246,6 +246,10 @@ def __init__(self, cfg: SweBenchGenerationConfig): "mkdir -p /root/tmux && " "curl -Lf https://github.com/nelsonenzo/tmux-appimage/releases/download/3.5a/tmux.appimage -o /root/tmux/tmux && " "chmod 777 /root/tmux/tmux && " + # download jq + "mkdir -p /root/jq && " + "curl -Lf https://github.com/jqlang/jq/releases/download/jq-1.8.1/jq-linux-amd64 -o /root/jq/jq && " + "chmod 777 /root/jq/jq && " # clone the openhands repo "rm -rf /root/OpenHands && " f"git clone {self.cfg.agent_framework_repo} /root/OpenHands && " @@ -531,13 +535,16 @@ async def _run_openhands(self, data_point, api_base): " echo 'This is because OpenHands DELETES EVERYTHING in the /workspace folder if it exists.' && " " exit 1; " "fi && " - # copy installed repo, uv & tmux dirs from /root_mount + # copy installed repo, uv, tmux & jq dirs from /root_mount "cp -r /root_mount/OpenHands /root && " "cp -r /root_mount/uv /root && " "cp -r /root_mount/tmux /root && " + "cp -r /root_mount/jq /root && " "cd /root/OpenHands && " - # add poetry & tmux to PATH - "export PATH=/root/uv/tool-bin:/root/tmux:$PATH && " + # make soft links to poetry, tmux & jq in /usr/local/bin, so OpenHands can run them from the command line + "ln -sf /root/uv/tool-bin/poetry /usr/local/bin/poetry && " + "ln -sf /root/tmux/tmux /usr/local/bin/tmux && " + "ln -sf /root/jq/jq /usr/local/bin/jq && " # enable tmux appimage to run without fusermount # https://docs.appimage.org/user-guide/troubleshooting/fuse.html#extract-and-run-type-2-appimages "export APPIMAGE_EXTRACT_AND_RUN=1 && " diff --git a/nemo_skills/prompt/config/eval/swe-bench/swe-agent/multilingual.yaml b/nemo_skills/prompt/config/eval/swe-bench/swe-agent/multilingual.yaml new file mode 100644 index 0000000000..2b0e67de89 --- /dev/null +++ 
b/nemo_skills/prompt/config/eval/swe-bench/swe-agent/multilingual.yaml @@ -0,0 +1,78 @@ +# Based on the default config from the SWE-agent repo: +# https://github.com/SWE-agent/SWE-agent/blob/1375ec4fa69d300b432b9ca61d6b0e5d7259131c/config/default.yaml +# but mentions of Python are removed to make the prompt language-agnostic. + +# note that this doesn't use nemo-skills prompt logic and instead is passed directly to swe-agent + +agent: + templates: + system_template: |- + You are a helpful assistant that can interact with a computer to solve tasks. + instance_template: |- + <uploaded_files> + {{working_dir}} + </uploaded_files> + I've uploaded a code repository in the directory {{working_dir}}. Consider the following PR description: + + <pr_description> + {{problem_statement}} + </pr_description> + + Can you help me implement the necessary changes to the repository so that the requirements specified in the <pr_description> are met? + I've already taken care of all changes to any of the test files described in the <pr_description>. This means you DON'T have to modify the testing logic or any of the tests in any way! + Your task is to make the minimal changes to non-tests files in the {{working_dir}} directory to ensure the <pr_description> is satisfied. + Follow these steps to resolve the issue: + 1. As a first step, it might be a good idea to find and read code relevant to the <pr_description> + 2. Create a script to reproduce the error and execute it using the bash tool, to confirm the error + 3. Edit the sourcecode of the repo to resolve the issue + 4. Rerun your reproduce script and confirm that the error is fixed! + 5. Think about edgecases and make sure your fix handles them as well + Your thinking should be thorough and so it's fine if it's very long. + next_step_template: |- + OBSERVATION: + {{observation}} + next_step_no_output_template: |- + Your command ran successfully and did not produce any output. 
+ tools: + env_variables: + PAGER: cat + MANPAGER: cat + LESS: -R + PIP_PROGRESS_BAR: 'off' + TQDM_DISABLE: '1' + GIT_PAGER: cat + bundles: + - path: tools/registry + - path: tools/edit_anthropic + - path: tools/review_on_submit_m + registry_variables: + USE_FILEMAP: 'true' + SUBMIT_REVIEW_MESSAGES: + - | + Thank you for your work on this issue. Please carefully follow the steps below to help review your changes. + + 1. If you made any changes to your code after running the reproduction script, please run the reproduction script again. + If the reproduction script is failing, please revisit your changes and make sure they are correct. + If you have already removed your reproduction script, please ignore this step. + 2. Remove your reproduction script (if you haven't done so already). + 3. If you have modified any TEST files, please revert them to the state they had before you started fixing the issue. + You can do this with `git checkout -- /path/to/test/file`. Use below <diff> to find the files you need to revert. + 4. Run the submit command again to confirm. + + Here is a list of all of your changes: + + <diff> + {{diff}} + </diff> + enable_bash_tool: true + parse_function: + type: function_calling + history_processors: [] + model: + # The following parameters are overridden by Nemo-Skills: + # name, api_base, temperature, top_p, completion_kwargs, per_instance_call_limit. + # Specifying them here will have no effect! Use Nemo-Skills options instead. + per_instance_cost_limit: 0 + total_cost_limit: 0 + max_input_tokens: 0 + max_output_tokens: 0