From 492dee86117ff4c925cef32d128965c9dbaf537c Mon Sep 17 00:00:00 2001 From: alec-flowers Date: Fri, 3 Oct 2025 19:34:09 -0700 Subject: [PATCH 1/4] better logging, decrease randomness Signed-off-by: alec-flowers --- tests/utils/payload_builder.py | 12 ++++++------ tests/utils/payloads.py | 4 +++- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/tests/utils/payload_builder.py b/tests/utils/payload_builder.py index 0e13d62e40..4e0b3c96e1 100644 --- a/tests/utils/payload_builder.py +++ b/tests/utils/payload_builder.py @@ -11,11 +11,11 @@ def chat_payload_default( - repeat_count: int = 3, + repeat_count: int = 2, expected_response: Optional[List[str]] = None, expected_log: Optional[List[str]] = None, - max_tokens: int = 150, - temperature: float = 0.1, + max_tokens: int = 300, + temperature: float = 0, stream: bool = False, ) -> ChatPayload: return ChatPayload( @@ -37,11 +37,11 @@ def chat_payload_default( def completion_payload_default( - repeat_count: int = 3, + repeat_count: int = 2, expected_response: Optional[List[str]] = None, expected_log: Optional[List[str]] = None, - max_tokens: int = 150, - temperature: float = 0.1, + max_tokens: int = 300, + temperature: float = 0, stream: bool = False, ) -> CompletionPayload: return CompletionPayload( diff --git a/tests/utils/payloads.py b/tests/utils/payloads.py index 8e5957e208..0cbb854c99 100644 --- a/tests/utils/payloads.py +++ b/tests/utils/payloads.py @@ -63,8 +63,10 @@ def validate(self, response: Any, content: str) -> None: if not content or expected not in content: missing_expected.append(expected) if missing_expected: + preview = (content or "")[:1000] raise AssertionError( - f"Expected content not found in response. Missing: {missing_expected}" + f"Expected content not found in response. Missing: {missing_expected}. 
" + f"Content preview (first 1000 chars): {preview!r}" ) logger.info(f"SUCCESS: All expected_responses: {self.expected_response} found.") From 3097b19a7a116abf0b14edaedd6b9210da046c4c Mon Sep 17 00:00:00 2001 From: alec-flowers Date: Sat, 8 Nov 2025 00:29:03 +0000 Subject: [PATCH 2/4] Add or and more matching params Signed-off-by: alec-flowers --- tests/serve/test_sglang.py | 16 ++++++------ tests/serve/test_trtllm.py | 8 +++--- tests/serve/test_vllm.py | 16 ++++++------ tests/utils/payload_builder.py | 47 ++++++++++++++++++++++------------ tests/utils/payloads.py | 43 ++++++++++++++++++++++++++----- 5 files changed, 88 insertions(+), 42 deletions(-) diff --git a/tests/serve/test_sglang.py b/tests/serve/test_sglang.py index b698265a5a..8b2c1afb78 100644 --- a/tests/serve/test_sglang.py +++ b/tests/serve/test_sglang.py @@ -51,7 +51,7 @@ class SGLangConfig(EngineConfig): request_payloads=[ chat_payload_default(), completion_payload_default(), - metric_payload_default(min_num_requests=6, backend="sglang"), + metric_payload_default(min_num_requests=4, backend="sglang"), ], ), "disaggregated": SGLangConfig( @@ -78,9 +78,9 @@ class SGLangConfig(EngineConfig): chat_payload_default(), completion_payload_default(), # Validate dynamo_component_* and sglang:* metrics from prefill worker (port 8081) - metric_payload_default(min_num_requests=6, backend="sglang", port=8081), + metric_payload_default(min_num_requests=4, backend="sglang", port=8081), # Validate dynamo_component_* and sglang:* metrics from decode worker (port 8082) - metric_payload_default(min_num_requests=6, backend="sglang", port=8082), + metric_payload_default(min_num_requests=4, backend="sglang", port=8082), ], ), "kv_events": SGLangConfig( @@ -119,7 +119,7 @@ class SGLangConfig(EngineConfig): models_port=8000, request_payloads=[ chat_payload_default( - expected_response=["Successfully Applied Chat Template"] + expected_response_all=["Successfully Applied Chat Template"] ) ], ), @@ -147,7 +147,7 @@ class 
SGLangConfig(EngineConfig): # NOTE: The response text may mention 'bus', 'train', 'streetcar', etc. # so we need something consistently found in the response, or a different # approach to validation for this test to be stable. - expected_response=["image"], + expected_response_any=["image", "bus", "train", "streetcar"], temperature=0.0, ) ], @@ -165,13 +165,13 @@ class SGLangConfig(EngineConfig): # Test default payload with multiple inputs embedding_payload_default( repeat_count=2, - expected_response=["Generated 2 embeddings with dimension"], + expected_response_all=["Generated 2 embeddings with dimension"], ), # Test single string input embedding_payload( input_text="Hello, world!", repeat_count=1, - expected_response=["Generated 1 embeddings with dimension"], + expected_response_all=["Generated 1 embeddings with dimension"], ), # Test multiple string inputs embedding_payload( @@ -181,7 +181,7 @@ class SGLangConfig(EngineConfig): "Natural language processing enables computers to understand text.", ], repeat_count=1, - expected_response=["Generated 3 embeddings with dimension"], + expected_response_all=["Generated 3 embeddings with dimension"], ), ], ), diff --git a/tests/serve/test_trtllm.py b/tests/serve/test_trtllm.py index 99b7243d82..e7e7a8176d 100644 --- a/tests/serve/test_trtllm.py +++ b/tests/serve/test_trtllm.py @@ -45,7 +45,7 @@ class TRTLLMConfig(EngineConfig): request_payloads=[ chat_payload_default(), completion_payload_default(), - metric_payload_default(min_num_requests=6, backend="trtllm"), + metric_payload_default(min_num_requests=4, backend="trtllm"), ], ), "disaggregated": TRTLLMConfig( @@ -70,8 +70,8 @@ class TRTLLMConfig(EngineConfig): request_payloads=[ chat_payload_default(), completion_payload_default(), - metric_payload_default(port=8081, min_num_requests=6, backend="trtllm"), - metric_payload_default(port=8082, min_num_requests=6, backend="trtllm"), + metric_payload_default(port=8081, min_num_requests=4, backend="trtllm"), + 
metric_payload_default(port=8082, min_num_requests=4, backend="trtllm"), ], ), "aggregated_router": TRTLLMConfig( @@ -147,7 +147,7 @@ def test_chat_only_aggregated_with_test_logits_processor( script_name=base.script_name, # agg.sh marks=[], # not used by this direct test request_payloads=[ - chat_payload_default(expected_response=["Hello world!"]), + chat_payload_default(expected_response_all=["Hello world!"]), ], model="Qwen/Qwen3-0.6B", delayed_start=base.delayed_start, diff --git a/tests/serve/test_vllm.py b/tests/serve/test_vllm.py index ff3cbcccb3..60a704ebcf 100644 --- a/tests/serve/test_vllm.py +++ b/tests/serve/test_vllm.py @@ -45,7 +45,7 @@ class VLLMConfig(EngineConfig): request_payloads=[ chat_payload_default(), completion_payload_default(), - metric_payload_default(min_num_requests=6, backend="vllm"), + metric_payload_default(min_num_requests=4, backend="vllm"), ], ), "agg-router": VLLMConfig( @@ -100,8 +100,8 @@ class VLLMConfig(EngineConfig): ], timeout=700, request_payloads=[ - chat_payload_default(expected_response=["joke"]), - completion_payload_default(expected_response=["joke"]), + chat_payload_default(), + completion_payload_default(), ], ), "multimodal_agg_llava_epd": VLLMConfig( @@ -123,7 +123,7 @@ class VLLMConfig(EngineConfig): }, ], repeat_count=1, - expected_response=["bus"], + expected_response_any=["bus", "image"], temperature=0.0, ) ], @@ -149,7 +149,7 @@ class VLLMConfig(EngineConfig): }, ], repeat_count=1, - expected_response=["bus"], + expected_response_any=["bus", "image"], ) ], ), @@ -175,7 +175,7 @@ class VLLMConfig(EngineConfig): }, ], repeat_count=1, - expected_response=["bus"], + expected_response_any=["bus", "image"], ), # Base64 data URL test (1x1 PNG inline, avoids network fetch) chat_payload( @@ -189,7 +189,7 @@ class VLLMConfig(EngineConfig): }, ], repeat_count=1, - expected_response=[], # Just validate no error + expected_response_all=[], # Just validate no error ), ], ), @@ -215,7 +215,7 @@ class 
VLLMConfig(EngineConfig): }, ], repeat_count=1, - expected_response=["rabbit"], + expected_response_any=["rabbit", "video", "detail"], temperature=0.7, ) ], diff --git a/tests/utils/payload_builder.py b/tests/utils/payload_builder.py index c6b175170e..7c4d061a36 100644 --- a/tests/utils/payload_builder.py +++ b/tests/utils/payload_builder.py @@ -17,7 +17,8 @@ def chat_payload_default( repeat_count: int = 2, - expected_response: Optional[List[str]] = None, + expected_response_all: Optional[List[str]] = None, + expected_response_any: Optional[List[str]] = None, expected_log: Optional[List[str]] = None, max_tokens: int = 300, temperature: float = 0, @@ -37,13 +38,15 @@ def chat_payload_default( }, repeat_count=repeat_count, expected_log=expected_log or [], - expected_response=expected_response or ["AI"], + expected_response_all=expected_response_all or [], + expected_response_any=expected_response_any or ["AI", "joke", "short", "robot"], ) def completion_payload_default( repeat_count: int = 2, - expected_response: Optional[List[str]] = None, + expected_response_all: Optional[List[str]] = None, + expected_response_any: Optional[List[str]] = None, expected_log: Optional[List[str]] = None, max_tokens: int = 300, temperature: float = 0, @@ -58,7 +61,8 @@ def completion_payload_default( }, repeat_count=repeat_count, expected_log=expected_log or [], - expected_response=expected_response or ["AI"], + expected_response_all=expected_response_all or [], + expected_response_any=expected_response_any or ["AI", "joke", "short", "robot"], ) @@ -73,7 +77,8 @@ def metric_payload_default( body={}, repeat_count=repeat_count, expected_log=expected_log or [], - expected_response=[], + expected_response_all=[], + expected_response_any=[], min_num_requests=min_num_requests, backend=backend, port=port, @@ -83,7 +88,8 @@ def metric_payload_default( def chat_payload( content: Union[str, List[Dict[str, Any]]], repeat_count: int = 1, - expected_response: Optional[List[str]] = None, + 
expected_response_all: Optional[List[str]] = None, + expected_response_any: Optional[List[str]] = None, expected_log: Optional[List[str]] = None, max_tokens: int = 300, temperature: Optional[float] = None, @@ -106,14 +112,16 @@ def chat_payload( body=body, repeat_count=repeat_count, expected_log=expected_log or [], - expected_response=expected_response or [], + expected_response_all=expected_response_all or [], + expected_response_any=expected_response_any or [], ) def completion_payload( prompt: str, repeat_count: int = 3, - expected_response: Optional[List[str]] = None, + expected_response_all: Optional[List[str]] = None, + expected_response_any: Optional[List[str]] = None, expected_log: Optional[List[str]] = None, max_tokens: int = 150, temperature: float = 0.1, @@ -128,13 +136,15 @@ def completion_payload( }, repeat_count=repeat_count, expected_log=expected_log or [], - expected_response=expected_response or [], + expected_response_all=expected_response_all or [], + expected_response_any=expected_response_any or [], ) def embedding_payload_default( repeat_count: int = 3, - expected_response: Optional[List[str]] = None, + expected_response_all: Optional[List[str]] = None, + expected_response_any: Optional[List[str]] = None, expected_log: Optional[List[str]] = None, ) -> EmbeddingPayload: return EmbeddingPayload( @@ -143,15 +153,17 @@ def embedding_payload_default( }, repeat_count=repeat_count, expected_log=expected_log or [], - expected_response=expected_response + expected_response_all=expected_response_all or ["Generated 2 embeddings with dimension"], + expected_response_any=expected_response_any or [], ) def embedding_payload( input_text: Union[str, List[str]], repeat_count: int = 3, - expected_response: Optional[List[str]] = None, + expected_response_all: Optional[List[str]] = None, + expected_response_any: Optional[List[str]] = None, expected_log: Optional[List[str]] = None, ) -> EmbeddingPayload: # Normalize input to list for consistent processing @@ 
-168,8 +180,9 @@ def embedding_payload( }, repeat_count=repeat_count, expected_log=expected_log or [], - expected_response=expected_response + expected_response_all=expected_response_all or [f"Generated {expected_count} embeddings with dimension"], + expected_response_any=expected_response_any or [], ) @@ -181,7 +194,8 @@ def make_chat_health_check(port: int, model: str): def _check_chat_endpoint(remaining_timeout: float = 30.0) -> bool: payload = chat_payload_default( repeat_count=1, - expected_response=[], + expected_response_all=[], + expected_response_any=[], max_tokens=8, temperature=0.0, stream=False, @@ -195,7 +209,7 @@ def _check_chat_endpoint(remaining_timeout: float = 30.0) -> bool: method=payload.method, log_level=10, ) - # Validate structure only; expected_response is empty + # Validate structure only; expected_response_all is empty _ = payload.response_handler(resp) return True except Exception: @@ -208,7 +222,8 @@ def make_completions_health_check(port: int, model: str): def _check_completions_endpoint(remaining_timeout: float = 30.0) -> bool: payload = completion_payload_default( repeat_count=1, - expected_response=[], + expected_response_all=[], + expected_response_any=[], max_tokens=8, temperature=0.0, stream=False, diff --git a/tests/utils/payloads.py b/tests/utils/payloads.py index aea3024450..ebe4527149 100644 --- a/tests/utils/payloads.py +++ b/tests/utils/payloads.py @@ -30,8 +30,9 @@ class BasePayload: """Generic payload body plus expectations and repeat count.""" body: Dict[str, Any] - expected_response: List[str] expected_log: List[str] + expected_response_all: List[str] = None # All must match (AND logic) + expected_response_any: List[str] = None # At least one must match (OR logic) repeat_count: int = 1 timeout: int = 60 @@ -41,6 +42,13 @@ class BasePayload: endpoint: str = "" method: str = "POST" + def __post_init__(self): + # Initialize expected_response fields if None + if self.expected_response_all is None: + 
self.expected_response_all = [] + if self.expected_response_any is None: + self.expected_response_any = [] + def url(self) -> str: ep = self.endpoint.lstrip("/") return f"http://{self.host}:{self.port}/{ep}" @@ -51,24 +59,47 @@ def with_model(self, model): p.body = {**p.body, "model": model} return p + @property + def expected_response(self) -> List[str]: + """Backward compatibility - maps to expected_response_all""" + return self.expected_response_all + def response_handler(self, response: Any) -> str: """Extract a text representation of the response for logging/validation.""" raise NotImplementedError("Subclasses must implement response_handler()") def validate(self, response: Any, content: str) -> None: - """Default validation: ensure expected substrings appear in content.""" - if self.expected_response: + """Validate expected substrings appear in content using AND/OR logic.""" + # Check AND logic (all must be present) + if self.expected_response_all: missing_expected = [] - for expected in self.expected_response: + for expected in self.expected_response_all: if not content or expected not in content: missing_expected.append(expected) if missing_expected: preview = (content or "")[:1000] raise AssertionError( - f"Expected content not found in response. Missing: {missing_expected}. " + f"Expected content (ALL) not found in response. Missing: {missing_expected}. " + f"Content preview (first 1000 chars): {preview!r}" + ) + logger.info( + f"SUCCESS: All expected_response_all: {self.expected_response_all} found." + ) + + # Check OR logic (at least one must be present) + if self.expected_response_any: + found = any( + expected in (content or "") for expected in self.expected_response_any + ) + if not found: + preview = (content or "")[:1000] + raise AssertionError( + f"Expected content (ANY) not found in response. None of {self.expected_response_any} matched. 
" f"Content preview (first 1000 chars): {preview!r}" ) - logger.info(f"SUCCESS: All expected_responses: {self.expected_response} found.") + logger.info( + f"SUCCESS: At least one from expected_response_any: {self.expected_response_any} found." + ) def process_response(self, response: Any) -> str: """Convenience: run response_handler then validate; return content.""" From be63149d1ded30b88ece322a5172f3ed3736b47c Mon Sep 17 00:00:00 2001 From: alec-flowers Date: Sat, 8 Nov 2025 01:20:27 +0000 Subject: [PATCH 3/4] fix mypy Signed-off-by: alec-flowers --- tests/utils/payloads.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/utils/payloads.py b/tests/utils/payloads.py index ebe4527149..1f888abdb9 100644 --- a/tests/utils/payloads.py +++ b/tests/utils/payloads.py @@ -31,8 +31,8 @@ class BasePayload: body: Dict[str, Any] expected_log: List[str] - expected_response_all: List[str] = None # All must match (AND logic) - expected_response_any: List[str] = None # At least one must match (OR logic) + expected_response_all: List[str] | None = None # All must match (AND logic) + expected_response_any: List[str] | None = None # At least one must match (OR logic) repeat_count: int = 1 timeout: int = 60 @@ -60,7 +60,7 @@ def with_model(self, model): return p @property - def expected_response(self) -> List[str]: + def expected_response(self) -> List[str] | None: """Backward compatibility - maps to expected_response_all""" return self.expected_response_all From c7093979e3d80845e01236b62be41ee7109e0aad Mon Sep 17 00:00:00 2001 From: alec-flowers Date: Sun, 9 Nov 2025 18:59:48 +0000 Subject: [PATCH 4/4] fix override issue Signed-off-by: alec-flowers --- tests/utils/payload_builder.py | 34 +++++++++++++++++++++++++++------- 1 file changed, 27 insertions(+), 7 deletions(-) diff --git a/tests/utils/payload_builder.py b/tests/utils/payload_builder.py index 7c4d061a36..71362daa53 100644 --- a/tests/utils/payload_builder.py +++ b/tests/utils/payload_builder.py 
@@ -12,13 +12,15 @@ ) # Common default text prompt used across tests -TEXT_PROMPT = "Tell me a short joke about AI." +DEFAULT_TEXT_PROMPT = "Tell me a short joke about AI." +DEFAULT_EXPECTED_RESPONSE_ANY = ("AI", "joke", "short", "robot") +_UNSET: Any = object() def chat_payload_default( repeat_count: int = 2, expected_response_all: Optional[List[str]] = None, - expected_response_any: Optional[List[str]] = None, + expected_response_any: Optional[List[str]] = _UNSET, expected_log: Optional[List[str]] = None, max_tokens: int = 300, temperature: float = 0, @@ -29,7 +31,7 @@ def chat_payload_default( "messages": [ { "role": "user", - "content": TEXT_PROMPT, + "content": DEFAULT_TEXT_PROMPT, } ], "max_tokens": max_tokens, @@ -39,14 +41,18 @@ def chat_payload_default( repeat_count=repeat_count, expected_log=expected_log or [], expected_response_all=expected_response_all or [], - expected_response_any=expected_response_any or ["AI", "joke", "short", "robot"], + expected_response_any=_resolve_expected_response_any( + expected_response_all, + expected_response_any, + DEFAULT_EXPECTED_RESPONSE_ANY, + ), ) def completion_payload_default( repeat_count: int = 2, expected_response_all: Optional[List[str]] = None, - expected_response_any: Optional[List[str]] = None, + expected_response_any: Optional[List[str]] = _UNSET, expected_log: Optional[List[str]] = None, max_tokens: int = 300, temperature: float = 0, @@ -54,7 +60,7 @@ def completion_payload_default( ) -> CompletionPayload: return CompletionPayload( body={ - "prompt": TEXT_PROMPT, + "prompt": DEFAULT_TEXT_PROMPT, "max_tokens": max_tokens, "temperature": temperature, "stream": stream, @@ -62,10 +68,24 @@ def completion_payload_default( repeat_count=repeat_count, expected_log=expected_log or [], expected_response_all=expected_response_all or [], - expected_response_any=expected_response_any or ["AI", "joke", "short", "robot"], + expected_response_any=_resolve_expected_response_any( + expected_response_all, + 
expected_response_any, + DEFAULT_EXPECTED_RESPONSE_ANY, + ), ) +def _resolve_expected_response_any( + expected_response_all: Optional[List[str]], + expected_response_any: Optional[List[str]], + default_any: List[str], +) -> List[str]: + if expected_response_any is _UNSET: + return [] if expected_response_all is not None else list(default_any) + return expected_response_any or [] + + def metric_payload_default( min_num_requests: int, repeat_count: int = 1,