diff --git a/.gitignore b/.gitignore
index 505a3b1..095468b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,6 +5,7 @@ build/
 dist/
 wheels/
 *.egg-info
+.DS_Store
 
 # Virtual environments
 .venv
diff --git a/result/aider_python.log b/result/aider_python.log
new file mode 100644
index 0000000..d6fdc81
--- /dev/null
+++ b/result/aider_python.log
@@ -0,0 +1,31 @@
+─────────────────────────────────────────────────── tmp.benchmarks/2025-08-03-16-28-23--Qwen ────────────────────────────────────────────────────
+- dirname: 2025-08-03-16-28-23--Qwen
+  test_cases: 34
+  model: openai/Qwen/Qwen2.5-Coder-0.5B-Instruct
+  edit_format: whole
+  commit_hash: 5a65457-dirty
+  pass_rate_1: 0.0
+  pass_rate_2: 2.9
+  pass_num_1: 0
+  pass_num_2: 1
+  percent_cases_well_formed: 100.0
+  error_outputs: 9
+  num_malformed_responses: 0
+  num_with_malformed_responses: 0
+  user_asks: 29
+  lazy_comments: 0
+  syntax_errors: 0
+  indentation_errors: 0
+  exhausted_context_windows: 9
+  prompt_tokens: 485139
+  completion_tokens: 45402
+  test_timeouts: 0
+  total_tests: 225
+  command: aider --model openai/Qwen/Qwen2.5-Coder-0.5B-Instruct
+  date: 2025-08-03
+  versions: 0.85.3.dev
+  seconds_per_case: 53.6
+  total_cost: 0.0000
+
+costs: $0.0000/test-case, $0.00 total, $0.00 projected
+─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
\ No newline at end of file
diff --git a/result/eval.log b/result/eval.log
new file mode 100644
index 0000000..f1bc8e9
--- /dev/null
+++ b/result/eval.log
@@ -0,0 +1,11 @@
+100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 2048/2048 [00:22<00:00, 91.33it/s]
+============================================================
+Concurrency         : 1024
+Requests            : 2048
+Successful requests : 2048
+Total output tokens : 225230
+QPS (req/s)         : 91.27
+Token QPS           : 10037.90 tokens/s
+Avg TTFT            : 6984.7 ms
+Avg TPOT            : 87.2 ms
+============================================================
\ No newline at end of file
diff --git a/src/aider_benchmark.sh b/src/aider_benchmark.sh
new file mode 100644
index 0000000..2393e21
--- /dev/null
+++ b/src/aider_benchmark.sh
@@ -0,0 +1,11 @@
+export OPENAI_API_BASE=http://localhost:33696/v1
+export OPENAI_API_KEY=sk-no-key-required
+# ./benchmark/benchmark.py Qwen --model openai/Qwen/Qwen2.5-Coder-0.5B-Instruct --languages python --edit-format whole --threads 10 --exercises-dir polyglot-benchmark
+
+./benchmark/benchmark.py Qwen \
+    --model openai/Qwen/Qwen2.5-Coder-0.5B-Instruct \
+    --languages python \
+    --edit-format whole \
+    --threads 8 \
+    --exercises-dir polyglot-benchmark
+# --num-tests 10
\ No newline at end of file
diff --git a/src/benchmark.py b/src/benchmark.py
new file mode 100644
index 0000000..98bebb3
--- /dev/null
+++ b/src/benchmark.py
@@ -0,0 +1,98 @@
+import asyncio, aiohttp, argparse, time, json, sys, tiktoken
+from typing import List, Tuple
+from tqdm import tqdm
+
+ENC = tiktoken.get_encoding("cl100k_base")  # currently unused; streamed chunks are counted as tokens below
+
+def build_payload(max_tokens: int = 128):
+    prompt = "Implement a Python function that reverses a linked list."
+    return {
+        "model": "Qwen/Qwen2.5-Coder-0.5B-Instruct",
+        "messages": [{"role": "user", "content": prompt}],
+        "max_tokens": max_tokens,
+        "temperature": 0.3,
+        "stream": True
+    }
+
+async def send_one(session, url, payload, stats: List):
+    start = time.perf_counter()
+    output_tokens = 0
+    try:
+        async with session.post(url, json=payload) as resp:
+            if resp.status != 200:
+                stats.append(("http_error", 0, 0, 0, 0))
+                return
+            ttft = None  # time to first token
+            async for line in resp.content:
+                if line.startswith(b"data: "):
+                    data = line[6:]
+                    if data.strip() == b"[DONE]":
+                        break
+                    try:
+                        chunk = json.loads(data)
+                    except json.JSONDecodeError:
+                        continue
+                    if "choices" in chunk and len(chunk["choices"]) > 0:
+                        delta = chunk["choices"][0].get("delta", {})
+                        if "content" in delta:
+                            if ttft is None:
+                                ttft = time.perf_counter() - start
+                            output_tokens += 1  # one streamed content delta counted as one output token
+        total = time.perf_counter() - start
+        stats.append(("ok", ttft or 0, total, output_tokens, 1))
+    except Exception as e:
+        stats.append(("exception", 0, 0, 0, 0))
+        print(e, file=sys.stderr)
+
+async def main(args):
+    url = f"http://{args.host}:{args.port}/v1/chat/completions"
+    payload = build_payload(args.max_tokens)
+
+    stats: List[Tuple[str, float, float, int, int]] = []
+
+    async with aiohttp.ClientSession(
+        connector=aiohttp.TCPConnector(limit=args.concurrent)
+    ) as session:
+        t0 = time.perf_counter()
+        tasks = [
+            asyncio.create_task(send_one(session, url, payload, stats))
+            for _ in range(args.requests)
+        ]
+        for f in tqdm(asyncio.as_completed(tasks), total=args.requests):
+            await f
+        total_wall = time.perf_counter() - t0
+
+    ok = [s for s in stats if s[0] == "ok"]
+    total_out_tokens = sum(s[3] for s in ok)
+    total_success = len(ok)
+
+    if total_success == 0:
+        print("All requests failed.")
+        return
+
+    # Aggregate metrics
+    qps = total_success / total_wall
+    token_qps = total_out_tokens / total_wall
+    avg_ttft = sum(s[1] for s in ok) / total_success
+    avg_tpot = (sum(s[2] for s in ok) - sum(s[1] for s in ok)) / total_out_tokens
+
+    print("="*60)
+    print(f"Concurrency         : {args.concurrent}")
+    print(f"Requests            : {args.requests}")
+    print(f"Successful requests : {total_success}")
+    print(f"Total output tokens : {total_out_tokens}")
+    print(f"QPS (req/s)         : {qps:.2f}")
+    print(f"Token QPS           : {token_qps:.2f} tokens/s")
+    print(f"Avg TTFT            : {avg_ttft*1000:.1f} ms")
+    print(f"Avg TPOT            : {avg_tpot*1000:.1f} ms")
+    print("="*60)
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--host", default="127.0.0.1")
+    parser.add_argument("--port", type=int, default=38468)
+    parser.add_argument("--concurrent", type=int, default=512)
+    parser.add_argument("--requests", type=int, default=512)
+    parser.add_argument("--max-tokens", type=int, default=128)
+    args = parser.parse_args()
+    asyncio.run(main(args))
\ No newline at end of file
diff --git a/src/serving.py b/src/serving.py
new file mode 100644
index 0000000..5fa3900
--- /dev/null
+++ b/src/serving.py
@@ -0,0 +1,132 @@
+'''
+from sglang.test.test_utils import is_in_ci
+
+if is_in_ci():
+    from patch import launch_server_cmd
+else:
+    from sglang.utils import launch_server_cmd
+
+from sglang.utils import wait_for_server, print_highlight, terminate_process
+
+server_process, port = launch_server_cmd(
+    "python3 -m sglang.launch_server "
+    "--model-path Qwen/Qwen2.5-Coder-0.5B-Instruct "
+    "--host 0.0.0.0 "
+    "--mem-fraction-static 0.85 "
+    "--context-length 32768 "
+    "--max-total-tokens 196608 "
+    "--max-prefill-tokens 49152 "
+    "--max-running-requests 1024 "
+    "--attention-backend flashinfer "
+    "--trust-remote-code "
+    "--prefill-attention-backend flashinfer "
flashinfer " + "--decode-attention-backend flashinfer " + "--chunked-prefill-size 4096 " + "--watchdog-timeout 600 " + "--enable-torch-compile" +) + +wait_for_server(f"http://localhost:{port}") +print(f"Server started on http://localhost:{port}") + +''' + +from __future__ import annotations +import dataclasses +import subprocess +import time +import typing as T + +def _import_launcher(): + from sglang.test.test_utils import is_in_ci + if is_in_ci(): + from patch import launch_server_cmd + else: + from sglang.utils import launch_server_cmd + return launch_server_cmd + +@dataclasses.dataclass +class ServerConfig: + model_path: str = "Qwen/Qwen2.5-Coder-0.5B-Instruct" + host: str = "0.0.0.0" + mem_fraction_static: float = 0.85 + context_length: int = 32_768 + max_total_tokens: int = 196_608 + max_prefill_tokens: int = 49_152 + max_running_requests: int = 1024 + attention_backend: str = "flashinfer" + trust_remote_code: bool = True + prefill_attention_backend: str = "flashinfer" + decode_attention_backend: str = "flashinfer" + chunked_prefill_size: int = 4096 + watchdog_timeout: int = 600 + enable_torch_compile: bool = True + + def to_cli_args(self) -> str: + flags = [ + f"--model-path {self.model_path}", + f"--host {self.host}", + f"--mem-fraction-static {self.mem_fraction_static}", + f"--context-length {self.context_length}", + f"--max-total-tokens {self.max_total_tokens}", + f"--max-prefill-tokens {self.max_prefill_tokens}", + f"--max-running-requests {self.max_running_requests}", + f"--attention-backend {self.attention_backend}", + f"--prefill-attention-backend {self.prefill_attention_backend}", + f"--decode-attention-backend {self.decode_attention_backend}", + f"--chunked-prefill-size {self.chunked_prefill_size}", + f"--watchdog-timeout {self.watchdog_timeout}", + ] + if self.trust_remote_code: + flags.append("--trust-remote-code") + if self.enable_torch_compile: + flags.append("--enable-torch-compile") + return " ".join(flags) + +class LLMServer: + def __init__(self, cfg: ServerConfig) -> None: + self.cfg = cfg + self._proc: T.Optional[subprocess.Popen] = None + self._port: T.Optional[int] = None + + def start(self) -> str: + launch_server_cmd = _import_launcher() + from sglang.utils import wait_for_server, print_highlight + + cmd = f"python3 -m sglang.launch_server {self.cfg.to_cli_args()}" + self._proc, self._port = launch_server_cmd(cmd) + endpoint = f"http://localhost:{self._port}" + + try: + wait_for_server(endpoint) + print_highlight(f"✅ Server ready at {endpoint}") + return endpoint + except Exception: + self.stop() + raise RuntimeError("Server failed to start.") + + def stop(self) -> None: + if self._proc and self._proc.poll() is None: + from sglang.utils import terminate_process + terminate_process(self._proc) + self._proc.wait() + print("🛑 Server terminated.") + self._proc = None + self._port = None + + def __enter__(self): + self.start() + return self + + def __exit__(self, exc_type, exc, tb): + self.stop() + +if __name__ == "__main__": + cfg = ServerConfig() + server = LLMServer(cfg) + try: + server.start() + while True: + time.sleep(1) + except KeyboardInterrupt: + server.stop() \ No newline at end of file