71 changes: 69 additions & 2 deletions benchmarks/README.md
@@ -1,14 +1,81 @@
# Grainchain Benchmarks

This directory contains benchmarking infrastructure for testing and comparing different sandbox providers in the Grainchain project.
This directory contains benchmarking infrastructure for testing and comparing different sandbox providers in the Grainchain project, with support for multiple repositories and programming languages.

## Overview

The benchmark suite evaluates sandbox providers across multiple scenarios:
The benchmark suite evaluates sandbox providers across multiple scenarios and repositories:
- **Multi-Repository Support**: Test TypeScript, Python, and other language repositories
- **Basic Commands**: Simple shell commands and system information
- **Python Execution**: Python script execution and package management
- **File Operations**: File creation, reading, writing, and manipulation
- **Computational Tasks**: CPU-intensive operations and mathematical computations
- **Per-Repository Output**: Generate separate results for each tested repository

## Supported Repositories

### TypeScript Repositories
- **outline**: Knowledge base application (default)

### Python Repositories
- **requests**: Popular HTTP library
- **fastapi**: Modern web framework

## Multi-Repository Benchmarking

### Quick Start

Run benchmarks for all configured repositories:
```bash
# Using grainchain (recommended)
source .venv/bin/activate
python benchmarks/scripts/grainchain_multi_repo_benchmark.py

# Using Docker (requires a Docker installation)
python benchmarks/scripts/multi_repo_benchmark_runner.py
```

Run benchmark for a specific repository:
```bash
# Using grainchain
source .venv/bin/activate
python benchmarks/scripts/grainchain_multi_repo_benchmark.py --repo requests

# Using Docker
python benchmarks/scripts/multi_repo_benchmark_runner.py --repo requests
```

Test the multi-repo setup with E2B:
```bash
source .venv/bin/activate
python test_multi_repo_dockerfile.py
```

Test specific repository setup:
```bash
source .venv/bin/activate
python test_multi_repo_dockerfile.py --repo fastapi
```

### Configuration

Repository configurations are stored in `benchmarks/configs/`:
- `outline.json` - TypeScript/Node.js repository
- `requests.json` - Python HTTP library
- `fastapi.json` - Python web framework

Each configuration includes:
- Repository URL and branch
- Language and package manager
- Install and test commands
- Trivial changes for snapshot testing
- Metrics collection settings
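The fields above can be read with a short loader sketch. This is a hypothetical helper (`load_repo_config` is not part of the benchmark scripts), assuming only the JSON schema shown in the configs below:

```python
import json
from pathlib import Path


def load_repo_config(repo_name: str, config_dir: str = "benchmarks/configs") -> dict:
    """Load a repository benchmark config and check the fields the runners rely on."""
    path = Path(config_dir) / f"{repo_name}.json"
    config = json.loads(path.read_text())
    # Fields every config in benchmarks/configs/ provides
    required = {
        "repo_name", "repo_url", "repo_branch", "language",
        "package_manager", "install_command", "test_command",
    }
    missing = required - config.keys()
    if missing:
        raise ValueError(f"{path} is missing fields: {sorted(missing)}")
    return config
```

For example, `load_repo_config("requests")` would return the dict for the Python HTTP library, with `config["install_command"]` ready to pass to the sandbox provider.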

### Results

Results are generated per repository in `benchmarks/results/`:
- `multi_benchmark_results_TIMESTAMP.json` - Overall results
- `{repo_name}_benchmark_TIMESTAMP.json` - Per-repository results
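Given that naming scheme, result files can be grouped by run type with a small sketch. This helper is illustrative only (not part of the benchmark scripts) and assumes nothing beyond the filename patterns listed above:

```python
from pathlib import Path


def group_result_files(results_dir: str = "benchmarks/results") -> dict[str, list[Path]]:
    """Split result files into overall runs and per-repository runs by filename."""
    grouped: dict[str, list[Path]] = {"overall": [], "per_repo": []}
    for path in sorted(Path(results_dir).glob("*.json")):
        # Overall runs: multi_benchmark_results_TIMESTAMP.json
        if path.name.startswith("multi_benchmark_results_"):
            grouped["overall"].append(path)
        # Per-repository runs: {repo_name}_benchmark_TIMESTAMP.json
        elif "_benchmark_" in path.name:
            grouped["per_repo"].append(path)
    return grouped
```

Checking `startswith` before the `_benchmark_` substring test matters here, since the overall filenames would otherwise match the per-repository pattern too.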

## Supported Providers

38 changes: 38 additions & 0 deletions benchmarks/configs/fastapi.json
@@ -0,0 +1,38 @@
{
"repo_name": "fastapi",
"repo_url": "https://github.com/tiangolo/fastapi.git",
"repo_branch": "master",
"language": "python",
"package_manager": "pip",
"install_command": "pip install -e .",
"test_command": "python -m pytest tests/ -x",
"base_image": "ghcr.io/openai/codex-universal:latest",
"container_name": "fastapi-benchmark",
"workspace_path": "/workspace",
"python_version": "3.12",
"benchmark_iterations": 3,
"trivial_changes": [
{
"type": "comment",
"file": "README.md",
"content": "# Benchmark test comment"
},
{
"type": "whitespace",
"file": "pyproject.toml",
"content": "\n"
},
{
"type": "log",
"file": "fastapi/__init__.py",
"content": "# Benchmark test log"
}
],
"metrics": {
"collect_build_time": true,
"collect_memory_usage": true,
"collect_filesystem_stats": true,
"collect_network_stats": true,
"run_tests": true
}
}
39 changes: 39 additions & 0 deletions benchmarks/configs/outline.json
@@ -0,0 +1,39 @@
{
"repo_name": "outline",
"repo_url": "https://github.com/outline/outline.git",
"repo_branch": "main",
"language": "typescript",
"package_manager": "yarn",
"install_command": "yarn install --frozen-lockfile",
"test_command": "yarn test",
"base_image": "ghcr.io/openai/codex-universal:latest",
"container_name": "outline-benchmark",
"workspace_path": "/workspace",
"node_version": "20",
"python_version": "3.12",
"benchmark_iterations": 3,
"trivial_changes": [
{
"type": "comment",
"file": "README.md",
"content": "# Benchmark test comment"
},
{
"type": "whitespace",
"file": "package.json",
"content": "\n"
},
{
"type": "log",
"file": "app/index.js",
"content": "console.log('benchmark test');"
}
],
"metrics": {
"collect_build_time": true,
"collect_memory_usage": true,
"collect_filesystem_stats": true,
"collect_network_stats": true,
"run_tests": true
}
}
38 changes: 38 additions & 0 deletions benchmarks/configs/requests.json
@@ -0,0 +1,38 @@
{
"repo_name": "requests",
"repo_url": "https://github.com/psf/requests.git",
"repo_branch": "main",
"language": "python",
"package_manager": "pip",
"install_command": "pip install -e .",
"test_command": "python -m pytest tests/ -x",
"base_image": "ghcr.io/openai/codex-universal:latest",
"container_name": "requests-benchmark",
"workspace_path": "/workspace",
"python_version": "3.12",
"benchmark_iterations": 3,
"trivial_changes": [
{
"type": "comment",
"file": "README.md",
"content": "# Benchmark test comment"
},
{
"type": "whitespace",
"file": "setup.py",
"content": "\n"
},
{
"type": "log",
"file": "src/requests/__init__.py",
"content": "# Benchmark test log"
}
],
"metrics": {
"collect_build_time": true,
"collect_memory_usage": true,
"collect_filesystem_stats": true,
"collect_network_stats": true,
"run_tests": true
}
}