diff --git a/bindings/python/src/smg/serve.py b/bindings/python/src/smg/serve.py index fe9a1167b..737bbdc76 100644 --- a/bindings/python/src/smg/serve.py +++ b/bindings/python/src/smg/serve.py @@ -185,7 +185,9 @@ def build_command( ): cmd.append("--enable-prompt-tokens-details") - cmd.extend(self._filter_backend_args(backend_args, ["--model", "--host", "--port"])) + cmd.extend( + self._filter_backend_args(backend_args, ["--model", "--host", "--port", "--uds"]) + ) return cmd @@ -384,15 +386,37 @@ def _add_sglang_args(parser: argparse.ArgumentParser) -> None: def _add_vllm_args(parser: argparse.ArgumentParser) -> None: - """Add vllm-specific arguments.""" + """Add vLLM engine CLI arguments.""" try: - from vllm.engine.arg_utils import EngineArgs + from vllm.engine import arg_utils as vllm_arg_utils + + engine_args_cls = getattr(vllm_arg_utils, "AsyncEngineArgs", None) or getattr( + vllm_arg_utils, "EngineArgs", None + ) + if engine_args_cls is None: + raise ImportError("vllm.engine.arg_utils is missing EngineArgs") - EngineArgs.add_cli_args(parser) + engine_args_cls.add_cli_args(parser) except ImportError: parser.error("vllm is not installed. Install it with: pip install vllm") +def _add_vllm_frontend_args(parser: argparse.ArgumentParser) -> None: + """Add vLLM OpenAI frontend arguments.""" + try: + from vllm.entrypoints.openai import cli_args as vllm_openai_cli_args + except ImportError: + vllm_openai_cli_args = None + + frontend_args_cls = ( + getattr(vllm_openai_cli_args, "FrontendArgs", None) + if vllm_openai_cli_args is not None + else None + ) + if frontend_args_cls is not None: + frontend_args_cls.add_cli_args(parser) + + def _add_trtllm_stub_args(parser: argparse.ArgumentParser) -> None: """Add TensorRT-LLM specific arguments. @@ -524,12 +548,16 @@ def parse_serve_args( ) add_serve_args(parser) _import_backend_args(backend, parser) + if backend == "vllm" and serve_router_args.connection_mode == "http": + logger.debug("Adding vLLM OpenAI frontend CLI arguments for HTTP workers.") + _add_vllm_frontend_args(parser) RouterArgs.add_cli_args(parser, use_router_prefix=True, exclude_host_port=True) if backend == "trtllm": args, _ = parser.parse_known_args(argv) else: args = parser.parse_args(argv) + return backend, args, backend_args diff --git a/bindings/python/tests/test_serve.py b/bindings/python/tests/test_serve.py index ef61936b3..08932a5aa 100644 --- a/bindings/python/tests/test_serve.py +++ b/bindings/python/tests/test_serve.py @@ -286,6 +286,73 @@ def test_vllm_explicit_exits(self): parse_serve_args(["--backend", "vllm"]) assert exc_info.value.code == 2 + def test_vllm_http_accepts_frontend_args(self): + def _mock_vllm_args(backend, parser): + if backend == "vllm": + parser.add_argument("--model", type=str) + else: + _import_backend_args(backend, parser) + + def _mock_vllm_frontend_args(parser): + parser.add_argument("--enable-auto-tool-choice", action="store_true") + parser.add_argument("--tool-call-parser", type=str) + + with ( + patch("smg.serve._import_backend_args", side_effect=_mock_vllm_args), + patch("smg.serve._add_vllm_frontend_args", side_effect=_mock_vllm_frontend_args), + ): + backend, args, backend_args = parse_serve_args( + [ + "--backend", + "vllm", + "--connection-mode", + "http", + "--model", + "/tmp/model", + "--enable-auto-tool-choice", + "--tool-call-parser", + "minimax_m2", + ] + ) + + assert backend == "vllm" + assert args.connection_mode == "http" + assert args.model == "/tmp/model" + assert args.enable_auto_tool_choice is True + assert args.tool_call_parser == "minimax_m2" + assert "--enable-auto-tool-choice" in backend_args + assert "--tool-call-parser" in backend_args + assert "minimax_m2" in backend_args + + def test_vllm_grpc_rejects_frontend_args(self): + def _mock_vllm_args(backend, parser): + if backend == "vllm": + parser.add_argument("--model", type=str) + else: + _import_backend_args(backend, parser) + + with ( + patch("smg.serve._import_backend_args", side_effect=_mock_vllm_args), + patch("smg.serve._add_vllm_frontend_args") as mock_frontend_args, + ): + with pytest.raises(SystemExit) as exc_info: + parse_serve_args( + [ + "--backend", + "vllm", + "--connection-mode", + "grpc", + "--model", + "/tmp/model", + "--enable-auto-tool-choice", + "--tool-call-parser", + "minimax_m2", + ] + ) + + assert exc_info.value.code == 2 + mock_frontend_args.assert_not_called() + def test_invalid_backend_exits(self): with pytest.raises(SystemExit): parse_serve_args(["--backend", "nonexistent"])