36 changes: 32 additions & 4 deletions bindings/python/src/smg/serve.py
@@ -185,7 +185,9 @@ def build_command(
     ):
         cmd.append("--enable-prompt-tokens-details")

-        cmd.extend(self._filter_backend_args(backend_args, ["--model", "--host", "--port"]))
+        cmd.extend(
+            self._filter_backend_args(backend_args, ["--model", "--host", "--port", "--uds"])
+        )

         return cmd

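Note on the hunk above: adding "--uds" to the exclusion list means a Unix-domain-socket flag supplied in backend_args is no longer forwarded to the backend command line, presumably because, like --model, --host, and --port, it is managed by the serve wrapper itself. The implementation of smg.serve._filter_backend_args is not shown in this diff; the sketch below only illustrates the assumed semantics, namely that an excluded flag is dropped together with its value.

# Illustrative sketch only -- the real smg.serve._filter_backend_args is not in this diff.
# Assumed behaviour: drop each excluded flag plus its value ("--flag value" or "--flag=value").
def filter_backend_args(backend_args, excluded):
    kept = []
    skip_next = False
    for arg in backend_args:
        if skip_next:
            skip_next = False
            continue
        flag = arg.split("=", 1)[0]
        if flag in excluded:
            # "--uds /tmp/worker.sock" style: also skip the value token that follows.
            skip_next = "=" not in arg
            continue
        kept.append(arg)
    return kept


# With "--uds" excluded, a socket path in backend_args stays with the wrapper
# (the --tensor-parallel-size flag here is just example data):
assert filter_backend_args(
    ["--uds", "/tmp/worker.sock", "--tensor-parallel-size", "2"],
    ["--model", "--host", "--port", "--uds"],
) == ["--tensor-parallel-size", "2"]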
@@ -384,15 +386,37 @@ def _add_sglang_args(parser: argparse.ArgumentParser) -> None:


 def _add_vllm_args(parser: argparse.ArgumentParser) -> None:
-    """Add vllm-specific arguments."""
+    """Add vLLM engine CLI arguments."""
     try:
-        from vllm.engine.arg_utils import EngineArgs
+        from vllm.engine import arg_utils as vllm_arg_utils

-        EngineArgs.add_cli_args(parser)
+        engine_args_cls = getattr(vllm_arg_utils, "AsyncEngineArgs", None) or getattr(
+            vllm_arg_utils, "EngineArgs", None
+        )
+        if engine_args_cls is None:
+            raise ImportError("vllm.engine.arg_utils is missing EngineArgs")
+
+        engine_args_cls.add_cli_args(parser)
     except ImportError:
         parser.error("vllm is not installed. Install it with: pip install vllm")


+def _add_vllm_frontend_args(parser: argparse.ArgumentParser) -> None:
+    """Add vLLM OpenAI frontend arguments."""
+    try:
+        from vllm.entrypoints.openai import cli_args as vllm_openai_cli_args
+    except ImportError:
+        vllm_openai_cli_args = None
+
+    frontend_args_cls = (
+        getattr(vllm_openai_cli_args, "FrontendArgs", None)
+        if vllm_openai_cli_args is not None
+        else None
+    )
+    if frontend_args_cls is not None:
+        frontend_args_cls.add_cli_args(parser)
+
+
 def _add_trtllm_stub_args(parser: argparse.ArgumentParser) -> None:
     """Add TensorRT-LLM specific arguments.

@@ -524,12 +548,16 @@ def parse_serve_args(
     )
     add_serve_args(parser)
     _import_backend_args(backend, parser)
+    if backend == "vllm" and serve_router_args.connection_mode == "http":
+        logger.debug("Adding vLLM OpenAI frontend CLI arguments for HTTP workers.")
+        _add_vllm_frontend_args(parser)
     RouterArgs.add_cli_args(parser, use_router_prefix=True, exclude_host_port=True)

     if backend == "trtllm":
         args, _ = parser.parse_known_args(argv)
     else:
         args = parser.parse_args(argv)

     return backend, args, backend_args


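The router-side change in parse_serve_args registers the vLLM OpenAI frontend flags only when the connection mode is "http". For "grpc" workers those flags are never added to the parser, so argparse treats them as unrecognized arguments and exits with status 2, which is what the new gRPC test below asserts. A standalone sketch of that mechanism follows; build_parser and its flag set are illustrative stand-ins, not the project's code.

import argparse


def build_parser(connection_mode):
    # Illustrative parser: only HTTP workers get the OpenAI frontend flags,
    # mirroring the conditional call to _add_vllm_frontend_args above.
    parser = argparse.ArgumentParser(prog="serve")
    parser.add_argument("--model")
    if connection_mode == "http":
        parser.add_argument("--enable-auto-tool-choice", action="store_true")
        parser.add_argument("--tool-call-parser")
    return parser


args = build_parser("http").parse_args(["--model", "/tmp/model", "--enable-auto-tool-choice"])
assert args.enable_auto_tool_choice is True

try:
    build_parser("grpc").parse_args(["--model", "/tmp/model", "--enable-auto-tool-choice"])
except SystemExit as exc:
    assert exc.code == 2  # "unrecognized arguments" -- the exit code the gRPC test expects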
67 changes: 67 additions & 0 deletions bindings/python/tests/test_serve.py
@@ -286,6 +286,73 @@ def test_vllm_explicit_exits(self):
             parse_serve_args(["--backend", "vllm"])
         assert exc_info.value.code == 2

+    def test_vllm_http_accepts_frontend_args(self):
+        def _mock_vllm_args(backend, parser):
+            if backend == "vllm":
+                parser.add_argument("--model", type=str)
+            else:
+                _import_backend_args(backend, parser)
+
+        def _mock_vllm_frontend_args(parser):
+            parser.add_argument("--enable-auto-tool-choice", action="store_true")
+            parser.add_argument("--tool-call-parser", type=str)
+
+        with (
+            patch("smg.serve._import_backend_args", side_effect=_mock_vllm_args),
+            patch("smg.serve._add_vllm_frontend_args", side_effect=_mock_vllm_frontend_args),
+        ):
+            backend, args, backend_args = parse_serve_args(
+                [
+                    "--backend",
+                    "vllm",
+                    "--connection-mode",
+                    "http",
+                    "--model",
+                    "/tmp/model",
+                    "--enable-auto-tool-choice",
+                    "--tool-call-parser",
+                    "minimax_m2",
+                ]
+            )
+
+        assert backend == "vllm"
+        assert args.connection_mode == "http"
+        assert args.model == "/tmp/model"
+        assert args.enable_auto_tool_choice is True
+        assert args.tool_call_parser == "minimax_m2"
+        assert "--enable-auto-tool-choice" in backend_args
+        assert "--tool-call-parser" in backend_args
+        assert "minimax_m2" in backend_args
+
+    def test_vllm_grpc_rejects_frontend_args(self):
+        def _mock_vllm_args(backend, parser):
+            if backend == "vllm":
+                parser.add_argument("--model", type=str)
+            else:
+                _import_backend_args(backend, parser)
+
+        with (
+            patch("smg.serve._import_backend_args", side_effect=_mock_vllm_args),
+            patch("smg.serve._add_vllm_frontend_args") as mock_frontend_args,
+        ):
+            with pytest.raises(SystemExit) as exc_info:
+                parse_serve_args(
+                    [
+                        "--backend",
+                        "vllm",
+                        "--connection-mode",
+                        "grpc",
+                        "--model",
+                        "/tmp/model",
+                        "--enable-auto-tool-choice",
+                        "--tool-call-parser",
+                        "minimax_m2",
+                    ]
+                )
+
+        assert exc_info.value.code == 2
+        mock_frontend_args.assert_not_called()
+
     def test_invalid_backend_exits(self):
         with pytest.raises(SystemExit):
             parse_serve_args(["--backend", "nonexistent"])