1 change: 1 addition & 0 deletions .gitignore
@@ -5,6 +5,7 @@ build/
dist/
wheels/
*.egg-info
.DS_Store

# Virtual environments
.venv
31 changes: 31 additions & 0 deletions result/aider_python.log
@@ -0,0 +1,31 @@
─────────────────────────────────────────────────── tmp.benchmarks/2025-08-03-16-28-23--Qwen ────────────────────────────────────────────────────
- dirname: 2025-08-03-16-28-23--Qwen
test_cases: 34
model: openai/Qwen/Qwen2.5-Coder-0.5B-Instruct
edit_format: whole
commit_hash: 5a65457-dirty
pass_rate_1: 0.0
pass_rate_2: 2.9
pass_num_1: 0
pass_num_2: 1
percent_cases_well_formed: 100.0
error_outputs: 9
num_malformed_responses: 0
num_with_malformed_responses: 0
user_asks: 29
lazy_comments: 0
syntax_errors: 0
indentation_errors: 0
exhausted_context_windows: 9
prompt_tokens: 485139
completion_tokens: 45402
test_timeouts: 0
total_tests: 225
command: aider --model openai/Qwen/Qwen2.5-Coder-0.5B-Instruct
date: 2025-08-03
versions: 0.85.3.dev
seconds_per_case: 53.6
total_cost: 0.0000

costs: $0.0000/test-case, $0.00 total, $0.00 projected
─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
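As a quick sanity check on the log above: a minimal sketch, assuming aider derives pass_rate_k as 100 * pass_num_k / test_cases (inferred from the field names, not confirmed against aider's source).

# Hedged cross-check of result/aider_python.log (field relationship assumed).
test_cases, pass_num_1, pass_num_2 = 34, 0, 1
print(round(100 * pass_num_1 / test_cases, 1))  # 0.0 -> pass_rate_1
print(round(100 * pass_num_2 / test_cases, 1))  # 2.9 -> pass_rate_2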
11 changes: 11 additions & 0 deletions result/eval.log
@@ -0,0 +1,11 @@
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 2048/2048 [00:22<00:00, 91.33it/s]
============================================================
Concurrency         : 1024
Requests            : 2048
Successful requests : 2048
Total output tokens : 225230
QPS (req/s)         : 91.27
Token QPS           : 10037.90 tokens/s
Avg TTFT            : 6984.7 ms
Avg TPOT            : 87.2 ms
============================================================
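These figures are internally consistent with the formulas in src/benchmark.py below; a back-of-envelope sketch, assuming the wall-clock time implied by the reported QPS:

# Hedged cross-check of result/eval.log; wall time is inferred from QPS,
# so the reproduction is approximate.
requests, out_tokens, qps = 2048, 225230, 91.27
wall = requests / qps                                   # ~22.4 s, matching tqdm's 00:22
print(f"Token QPS ~ {out_tokens / wall:.1f} tokens/s")  # ~10037.5 vs logged 10037.90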
11 changes: 11 additions & 0 deletions src/aider_benchmark.sh
@@ -0,0 +1,11 @@
export OPENAI_API_BASE=http://localhost:33696/v1
export OPENAI_API_KEY=sk-no-key-required
# ./benchmark/benchmark.py Qwen --model openai/Qwen/Qwen2.5-Coder-0.5B-Instruct --languages python --edit-format whole --threads 10 --exercises-dir polyglot-benchmark

./benchmark/benchmark.py Qwen \
--model openai/Qwen/Qwen2.5-Coder-0.5B-Instruct \
--languages python \
--edit-format whole \
--threads 8 \
--exercises-dir polyglot-benchmark
# --num-tests 10
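Before launching the benchmark it is worth confirming that the endpoint in OPENAI_API_BASE is up and serving the expected model; a minimal sketch (the /v1/models route is standard for OpenAI-compatible servers such as sglang, but treat that as an assumption):

# Hedged connectivity check for the endpoint in OPENAI_API_BASE.
import json
import os
import urllib.request

base = os.environ.get("OPENAI_API_BASE", "http://localhost:33696/v1")
with urllib.request.urlopen(f"{base}/models", timeout=5) as resp:
    models = json.load(resp)
print(models["data"][0]["id"])  # expect Qwen/Qwen2.5-Coder-0.5B-Instruct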
98 changes: 98 additions & 0 deletions src/benchmark.py
@@ -0,0 +1,98 @@
import argparse
import asyncio
import json
import sys
import time
from typing import List, Tuple

import aiohttp
from tqdm import tqdm

def build_payload(max_tokens: int = 128):
prompt = "Implement a Python function that reverses a linked list."
return {
"model": "Qwen/Qwen2.5-Coder-0.5B-Instruct",
"messages": [{"role": "user", "content": prompt}],
"max_tokens": max_tokens,
"temperature": 0.3,
"stream": True
}

async def send_one(session, url, payload, stats: List):
start = time.perf_counter()
output_tokens = 0
try:
async with session.post(url, json=payload) as resp:
if resp.status != 200:
stats.append(("http_error", 0, 0, 0, 0))
return
ttft = None
async for line in resp.content:
if line.startswith(b"data: "):
data = line[6:]
if data.strip() == b"[DONE]":
break
                    try:
                        chunk = json.loads(data)
                    except json.JSONDecodeError:
                        continue
if "choices" in chunk and len(chunk["choices"]) > 0:
delta = chunk["choices"][0].get("delta", {})
if "content" in delta:
if ttft is None:
ttft = time.perf_counter() - start
output_tokens += 1
total = time.perf_counter() - start
stats.append(("ok", ttft or 0, total, output_tokens, 1))
except Exception as e:
stats.append(("exception", 0, 0, 0, 0))
print(e, file=sys.stderr)

async def main(args):
url = f"http://{args.host}:{args.port}/v1/chat/completions"
payload = build_payload(args.max_tokens)

stats: List[Tuple[str, float, float, int, int]] = []

async with aiohttp.ClientSession(
connector=aiohttp.TCPConnector(limit=args.concurrent)
) as session:
t0 = time.perf_counter()
tasks = [
asyncio.create_task(send_one(session, url, payload, stats))
for _ in range(args.requests)
]
for f in tqdm(asyncio.as_completed(tasks), total=args.requests):
await f
total_wall = time.perf_counter() - t0

ok = [s for s in stats if s[0] == "ok"]
total_out_tokens = sum(s[3] for s in ok)
total_success = len(ok)

if total_success == 0:
print("All requests failed.")
return

    # Derived metrics
    qps = total_success / total_wall
    token_qps = total_out_tokens / total_wall
    avg_ttft = sum(s[1] for s in ok) / total_success
    # Mean time per output token, excluding TTFT; guard against zero output tokens.
    avg_tpot = (sum(s[2] for s in ok) - sum(s[1] for s in ok)) / max(total_out_tokens, 1)

print("="*60)
print(f"并发数 : {args.concurrent}")
print(f"请求数 : {args.requests}")
print(f"成功请求 : {total_success}")
print(f"总输出 tokens : {total_out_tokens}")
print(f"QPS (req/s) : {qps:.2f}")
print(f"Token QPS : {token_qps:.2f} tokens/s")
print(f"平均 TTFT : {avg_ttft*1000:.1f} ms")
print(f"平均 TPOT : {avg_tpot*1000:.1f} ms")
print("="*60)

if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--host", default="127.0.0.1")
parser.add_argument("--port", type=int, default=38468)
parser.add_argument("--concurrent", type=int, default=512)
parser.add_argument("--requests", type=int, default=512)
parser.add_argument("--max-tokens", type=int, default=128)
args = parser.parse_args()
asyncio.run(main(args))
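For reference, result/eval.log above corresponds to an invocation along these lines (a sketch: the concurrency and request counts come from the log; the port is whatever the server printed at launch, so adjust --port):

# Hedged reproduction of result/eval.log (adjust --port to the live server).
import subprocess

subprocess.run(
    ["python", "src/benchmark.py",
     "--host", "127.0.0.1", "--port", "38468",
     "--concurrent", "1024", "--requests", "2048",
     "--max-tokens", "128"],
    check=True,
)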
132 changes: 132 additions & 0 deletions src/serving.py
@@ -0,0 +1,132 @@
'''
from sglang.test.test_utils import is_in_ci

if is_in_ci():
from patch import launch_server_cmd
else:
from sglang.utils import launch_server_cmd

from sglang.utils import wait_for_server, print_highlight, terminate_process

server_process, port = launch_server_cmd(
"python3 -m sglang.launch_server "
"--model-path Qwen/Qwen2.5-Coder-0.5B-Instruct "
"--host 0.0.0.0 "
"--mem-fraction-static 0.85 "
"--context-length 32768 "
"--max-total-tokens 196608 "
"--max-prefill-tokens 49152 "
"--max-running-requests 1024 "
"--attention-backend flashinfer "
"--trust-remote-code "
"--prefill-attention-backend flashinfer "
"--decode-attention-backend flashinfer "
"--chunked-prefill-size 4096 "
"--watchdog-timeout 600 "
"--enable-torch-compile"
)

wait_for_server(f"http://localhost:{port}")
print(f"Server started on http://localhost:{port}")

'''

from __future__ import annotations
import dataclasses
import subprocess
import time
import typing as T

def _import_launcher():
from sglang.test.test_utils import is_in_ci
if is_in_ci():
from patch import launch_server_cmd
else:
from sglang.utils import launch_server_cmd
return launch_server_cmd

@dataclasses.dataclass
class ServerConfig:
model_path: str = "Qwen/Qwen2.5-Coder-0.5B-Instruct"
host: str = "0.0.0.0"
mem_fraction_static: float = 0.85
context_length: int = 32_768
max_total_tokens: int = 196_608
max_prefill_tokens: int = 49_152
max_running_requests: int = 1024
attention_backend: str = "flashinfer"
trust_remote_code: bool = True
prefill_attention_backend: str = "flashinfer"
decode_attention_backend: str = "flashinfer"
chunked_prefill_size: int = 4096
watchdog_timeout: int = 600
enable_torch_compile: bool = True

def to_cli_args(self) -> str:
flags = [
f"--model-path {self.model_path}",
f"--host {self.host}",
f"--mem-fraction-static {self.mem_fraction_static}",
f"--context-length {self.context_length}",
f"--max-total-tokens {self.max_total_tokens}",
f"--max-prefill-tokens {self.max_prefill_tokens}",
f"--max-running-requests {self.max_running_requests}",
f"--attention-backend {self.attention_backend}",
f"--prefill-attention-backend {self.prefill_attention_backend}",
f"--decode-attention-backend {self.decode_attention_backend}",
f"--chunked-prefill-size {self.chunked_prefill_size}",
f"--watchdog-timeout {self.watchdog_timeout}",
]
if self.trust_remote_code:
flags.append("--trust-remote-code")
if self.enable_torch_compile:
flags.append("--enable-torch-compile")
return " ".join(flags)

class LLMServer:
def __init__(self, cfg: ServerConfig) -> None:
self.cfg = cfg
self._proc: T.Optional[subprocess.Popen] = None
self._port: T.Optional[int] = None

def start(self) -> str:
launch_server_cmd = _import_launcher()
from sglang.utils import wait_for_server, print_highlight

cmd = f"python3 -m sglang.launch_server {self.cfg.to_cli_args()}"
self._proc, self._port = launch_server_cmd(cmd)
endpoint = f"http://localhost:{self._port}"

try:
wait_for_server(endpoint)
print_highlight(f"✅ Server ready at {endpoint}")
return endpoint
        except Exception as exc:
            self.stop()
            raise RuntimeError("Server failed to start.") from exc

def stop(self) -> None:
if self._proc and self._proc.poll() is None:
from sglang.utils import terminate_process
terminate_process(self._proc)
self._proc.wait()
print("🛑 Server terminated.")
self._proc = None
self._port = None

def __enter__(self):
self.start()
return self

def __exit__(self, exc_type, exc, tb):
self.stop()

if __name__ == "__main__":
cfg = ServerConfig()
server = LLMServer(cfg)
try:
server.start()
while True:
time.sleep(1)
except KeyboardInterrupt:
server.stop()
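A minimal usage sketch of the public API, pairing the server with the benchmark client (assumes serving.py is importable and sglang is installed):

from serving import LLMServer, ServerConfig

server = LLMServer(ServerConfig())
try:
    endpoint = server.start()  # e.g. http://localhost:<port>
    print(f"point src/benchmark.py at {endpoint}")  # then run the client
finally:
    server.stop()  # safe to call even if start() already cleaned up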