1 change: 1 addition & 0 deletions .gitignore
@@ -5,6 +5,7 @@ build/
dist/
wheels/
*.egg-info
.DS_Store

# Virtual environments
.venv
31 changes: 31 additions & 0 deletions result/aider_python.log
@@ -0,0 +1,31 @@
─────────────────────────────────────────────────── tmp.benchmarks/2025-08-03-16-28-23--Qwen ────────────────────────────────────────────────────
- dirname: 2025-08-03-16-28-23--Qwen
test_cases: 34
model: openai/Qwen/Qwen2.5-Coder-0.5B-Instruct
edit_format: whole
commit_hash: 5a65457-dirty
pass_rate_1: 0.0
pass_rate_2: 2.9
pass_num_1: 0
pass_num_2: 1
percent_cases_well_formed: 100.0
error_outputs: 9
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 29
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 9
prompt_tokens: 485139
completion_tokens: 45402
test_timeouts: 0
total_tests: 225
command: aider --model openai/Qwen/Qwen2.5-Coder-0.5B-Instruct
date: 2025-08-03
versions: 0.85.3.dev
seconds_per_case: 53.6
total_cost: 0.0000

costs: $0.0000/test-case, $0.00 total, $0.00 projected
─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
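As a quick sanity check on the log above: a minimal sketch, assuming aider derives pass_rate_k as 100 * pass_num_k / test_cases (inferred from the field names, not confirmed against aider's source).

# Hedged cross-check of result/aider_python.log (field relationship assumed).
test_cases, pass_num_1, pass_num_2 = 34, 0, 1
print(round(100 * pass_num_1 / test_cases, 1))  # 0.0 -> pass_rate_1
print(round(100 * pass_num_2 / test_cases, 1))  # 2.9 -> pass_rate_2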
11 changes: 11 additions & 0 deletions result/eval.log
@@ -0,0 +1,11 @@
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 2048/2048 [00:22<00:00, 91.33it/s]
============================================================
Concurrency         : 1024
Requests            : 2048
Successful requests : 2048
Total output tokens : 225230
QPS (req/s)         : 91.27
Token QPS           : 10037.90 tokens/s
Avg TTFT            : 6984.7 ms
Avg TPOT            : 87.2 ms
============================================================
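These figures are internally consistent with the formulas in src/benchmark.py below; a back-of-envelope sketch, assuming the wall-clock time implied by the reported QPS:

# Hedged cross-check of result/eval.log; wall time is inferred from QPS,
# so the reproduction is approximate.
requests, out_tokens, qps = 2048, 225230, 91.27
wall = requests / qps                                   # ~22.4 s, matching tqdm's 00:22
print(f"Token QPS ~ {out_tokens / wall:.1f} tokens/s")  # ~10037.5 vs logged 10037.90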
11 changes: 11 additions & 0 deletions src/aider_benchmark.sh
@@ -0,0 +1,11 @@
export OPENAI_API_BASE=http://localhost:33696/v1
export OPENAI_API_KEY=sk-no-key-required
# ./benchmark/benchmark.py Qwen --model openai/Qwen/Qwen2.5-Coder-0.5B-Instruct --languages python --edit-format whole --threads 10 --exercises-dir polyglot-benchmark

./benchmark/benchmark.py Qwen \
--model openai/Qwen/Qwen2.5-Coder-0.5B-Instruct \
--languages python \
--edit-format whole \
--threads 8 \
--exercises-dir polyglot-benchmark
# --num-tests 10
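Before launching the benchmark it is worth confirming that the endpoint in OPENAI_API_BASE is up and serving the expected model; a minimal sketch (the /v1/models route is standard for OpenAI-compatible servers such as sglang, but treat that as an assumption):

# Hedged connectivity check for the endpoint in OPENAI_API_BASE.
import json
import os
import urllib.request

base = os.environ.get("OPENAI_API_BASE", "http://localhost:33696/v1")
with urllib.request.urlopen(f"{base}/models", timeout=5) as resp:
    models = json.load(resp)
print(models["data"][0]["id"])  # expect Qwen/Qwen2.5-Coder-0.5B-Instruct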
98 changes: 98 additions & 0 deletions src/benchmark.py
@@ -0,0 +1,98 @@
import argparse
import asyncio
import json
import sys
import time
from typing import List, Tuple

import aiohttp
from tqdm import tqdm

def build_payload(max_tokens: int = 128):
prompt = "Implement a Python function that reverses a linked list."
return {
"model": "Qwen/Qwen2.5-Coder-0.5B-Instruct",
"messages": [{"role": "user", "content": prompt}],
"max_tokens": max_tokens,
"temperature": 0.3,
"stream": True
}

async def send_one(session, url, payload, stats: List):
start = time.perf_counter()
output_tokens = 0
try:
async with session.post(url, json=payload) as resp:
if resp.status != 200:
stats.append(("http_error", 0, 0, 0, 0))
return
ttft = None
async for line in resp.content:
if line.startswith(b"data: "):
data = line[6:]
if data.strip() == b"[DONE]":
break
                    try:
                        chunk = json.loads(data)
                    except json.JSONDecodeError:
                        continue
if "choices" in chunk and len(chunk["choices"]) > 0:
delta = chunk["choices"][0].get("delta", {})
if "content" in delta:
if ttft is None:
ttft = time.perf_counter() - start
output_tokens += 1
total = time.perf_counter() - start
stats.append(("ok", ttft or 0, total, output_tokens, 1))
except Exception as e:
stats.append(("exception", 0, 0, 0, 0))
print(e, file=sys.stderr)

async def main(args):
url = f"http://{args.host}:{args.port}/v1/chat/completions"
payload = build_payload(args.max_tokens)

stats: List[Tuple[str, float, float, int, int]] = []

async with aiohttp.ClientSession(
connector=aiohttp.TCPConnector(limit=args.concurrent)
) as session:
t0 = time.perf_counter()
tasks = [
asyncio.create_task(send_one(session, url, payload, stats))
for _ in range(args.requests)
]
for f in tqdm(asyncio.as_completed(tasks), total=args.requests):
await f
total_wall = time.perf_counter() - t0

ok = [s for s in stats if s[0] == "ok"]
total_out_tokens = sum(s[3] for s in ok)
total_success = len(ok)

if total_success == 0:
print("All requests failed.")
return

    # Derived metrics
    qps = total_success / total_wall
    token_qps = total_out_tokens / total_wall
    avg_ttft = sum(s[1] for s in ok) / total_success
    # Mean time per output token, excluding TTFT; guard against zero output tokens.
    avg_tpot = (sum(s[2] for s in ok) - sum(s[1] for s in ok)) / max(total_out_tokens, 1)

print("="*60)
print(f"并发数 : {args.concurrent}")
print(f"请求数 : {args.requests}")
print(f"成功请求 : {total_success}")
print(f"总输出 tokens : {total_out_tokens}")
print(f"QPS (req/s) : {qps:.2f}")
print(f"Token QPS : {token_qps:.2f} tokens/s")
print(f"平均 TTFT : {avg_ttft*1000:.1f} ms")
print(f"平均 TPOT : {avg_tpot*1000:.1f} ms")
print("="*60)

if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--host", default="127.0.0.1")
parser.add_argument("--port", type=int, default=38468)
parser.add_argument("--concurrent", type=int, default=512)
parser.add_argument("--requests", type=int, default=512)
parser.add_argument("--max-tokens", type=int, default=128)
args = parser.parse_args()
asyncio.run(main(args))
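For reference, result/eval.log above corresponds to an invocation along these lines (a sketch: the concurrency and request counts come from the log; the port is whatever the server printed at launch, so adjust --port):

# Hedged reproduction of result/eval.log (adjust --port to the live server).
import subprocess

subprocess.run(
    ["python", "src/benchmark.py",
     "--host", "127.0.0.1", "--port", "38468",
     "--concurrent", "1024", "--requests", "2048",
     "--max-tokens", "128"],
    check=True,
)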
132 changes: 132 additions & 0 deletions src/serving.py
@@ -0,0 +1,132 @@
'''
from sglang.test.test_utils import is_in_ci

if is_in_ci():
from patch import launch_server_cmd
else:
from sglang.utils import launch_server_cmd

from sglang.utils import wait_for_server, print_highlight, terminate_process

server_process, port = launch_server_cmd(
"python3 -m sglang.launch_server "
"--model-path Qwen/Qwen2.5-Coder-0.5B-Instruct "
"--host 0.0.0.0 "
"--mem-fraction-static 0.85 "
"--context-length 32768 "
"--max-total-tokens 196608 "
"--max-prefill-tokens 49152 "
"--max-running-requests 1024 "
"--attention-backend flashinfer "
"--trust-remote-code "
"--prefill-attention-backend flashinfer "
"--decode-attention-backend flashinfer "
"--chunked-prefill-size 4096 "
"--watchdog-timeout 600 "
"--enable-torch-compile"
)

wait_for_server(f"http://localhost:{port}")
print(f"Server started on http://localhost:{port}")

'''

from __future__ import annotations
import dataclasses
import subprocess
import time
import typing as T

def _import_launcher():
from sglang.test.test_utils import is_in_ci
if is_in_ci():
from patch import launch_server_cmd
else:
from sglang.utils import launch_server_cmd
return launch_server_cmd

@dataclasses.dataclass
class ServerConfig:
model_path: str = "Qwen/Qwen2.5-Coder-0.5B-Instruct"
host: str = "0.0.0.0"
mem_fraction_static: float = 0.85
context_length: int = 32_768
max_total_tokens: int = 196_608
max_prefill_tokens: int = 49_152
max_running_requests: int = 1024
attention_backend: str = "flashinfer"
trust_remote_code: bool = True
prefill_attention_backend: str = "flashinfer"
decode_attention_backend: str = "flashinfer"
chunked_prefill_size: int = 4096
watchdog_timeout: int = 600
enable_torch_compile: bool = True

def to_cli_args(self) -> str:
flags = [
f"--model-path {self.model_path}",
f"--host {self.host}",
f"--mem-fraction-static {self.mem_fraction_static}",
f"--context-length {self.context_length}",
f"--max-total-tokens {self.max_total_tokens}",
f"--max-prefill-tokens {self.max_prefill_tokens}",
f"--max-running-requests {self.max_running_requests}",
f"--attention-backend {self.attention_backend}",
f"--prefill-attention-backend {self.prefill_attention_backend}",
f"--decode-attention-backend {self.decode_attention_backend}",
f"--chunked-prefill-size {self.chunked_prefill_size}",
f"--watchdog-timeout {self.watchdog_timeout}",
]
if self.trust_remote_code:
flags.append("--trust-remote-code")
if self.enable_torch_compile:
flags.append("--enable-torch-compile")
return " ".join(flags)

class LLMServer:
def __init__(self, cfg: ServerConfig) -> None:
self.cfg = cfg
self._proc: T.Optional[subprocess.Popen] = None
self._port: T.Optional[int] = None

def start(self) -> str:
launch_server_cmd = _import_launcher()
from sglang.utils import wait_for_server, print_highlight

cmd = f"python3 -m sglang.launch_server {self.cfg.to_cli_args()}"
self._proc, self._port = launch_server_cmd(cmd)
endpoint = f"http://localhost:{self._port}"

try:
wait_for_server(endpoint)
print_highlight(f"✅ Server ready at {endpoint}")
return endpoint
        except Exception as exc:
            self.stop()
            raise RuntimeError("Server failed to start.") from exc

def stop(self) -> None:
if self._proc and self._proc.poll() is None:
from sglang.utils import terminate_process
terminate_process(self._proc)
self._proc.wait()
print("🛑 Server terminated.")
self._proc = None
self._port = None

def __enter__(self):
self.start()
return self

def __exit__(self, exc_type, exc, tb):
self.stop()

if __name__ == "__main__":
cfg = ServerConfig()
server = LLMServer(cfg)
try:
server.start()
while True:
time.sleep(1)
except KeyboardInterrupt:
server.stop()
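A minimal usage sketch of the public API, pairing the server with the benchmark client (assumes serving.py is importable and sglang is installed):

from serving import LLMServer, ServerConfig

server = LLMServer(ServerConfig())
try:
    endpoint = server.start()  # e.g. http://localhost:<port>
    print(f"point src/benchmark.py at {endpoint}")  # then run the client
finally:
    server.stop()  # safe to call even if start() already cleaned up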