From 492dee86117ff4c925cef32d128965c9dbaf537c Mon Sep 17 00:00:00 2001
From: alec-flowers <aflowers@nvidia.com>
Date: Fri, 3 Oct 2025 19:34:09 -0700
Subject: [PATCH 1/4] better logging, decrease randomness

Signed-off-by: alec-flowers <aflowers@nvidia.com>
---
 tests/utils/payload_builder.py | 12 ++++++------
 tests/utils/payloads.py        |  4 +++-
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/tests/utils/payload_builder.py b/tests/utils/payload_builder.py
index 0e13d62e40..4e0b3c96e1 100644
--- a/tests/utils/payload_builder.py
+++ b/tests/utils/payload_builder.py
@@ -11,11 +11,11 @@
 
 
 def chat_payload_default(
-    repeat_count: int = 3,
+    repeat_count: int = 2,
     expected_response: Optional[List[str]] = None,
     expected_log: Optional[List[str]] = None,
-    max_tokens: int = 150,
-    temperature: float = 0.1,
+    max_tokens: int = 300,
+    temperature: float = 0,
     stream: bool = False,
 ) -> ChatPayload:
     return ChatPayload(
@@ -37,11 +37,11 @@ def chat_payload_default(
 
 
 def completion_payload_default(
-    repeat_count: int = 3,
+    repeat_count: int = 2,
     expected_response: Optional[List[str]] = None,
     expected_log: Optional[List[str]] = None,
-    max_tokens: int = 150,
-    temperature: float = 0.1,
+    max_tokens: int = 300,
+    temperature: float = 0,
     stream: bool = False,
 ) -> CompletionPayload:
     return CompletionPayload(
diff --git a/tests/utils/payloads.py b/tests/utils/payloads.py
index 8e5957e208..0cbb854c99 100644
--- a/tests/utils/payloads.py
+++ b/tests/utils/payloads.py
@@ -63,8 +63,10 @@ def validate(self, response: Any, content: str) -> None:
                 if not content or expected not in content:
                     missing_expected.append(expected)
             if missing_expected:
+                preview = (content or "")[:1000]
                 raise AssertionError(
-                    f"Expected content not found in response. Missing: {missing_expected}"
+                    f"Expected content not found in response. Missing: {missing_expected}. "
+                    f"Content preview (first 1000 chars): {preview!r}"
                 )
         logger.info(f"SUCCESS: All expected_responses: {self.expected_response} found.")
 

From 3097b19a7a116abf0b14edaedd6b9210da046c4c Mon Sep 17 00:00:00 2001
From: alec-flowers <aflowers@nvidia.com>
Date: Sat, 8 Nov 2025 00:29:03 +0000
Subject: [PATCH 2/4] Add OR (any-of) matching and more matching params

Signed-off-by: alec-flowers <aflowers@nvidia.com>
---
 tests/serve/test_sglang.py     | 16 ++++++------
 tests/serve/test_trtllm.py     |  8 +++---
 tests/serve/test_vllm.py       | 16 ++++++------
 tests/utils/payload_builder.py | 47 ++++++++++++++++++++++------------
 tests/utils/payloads.py        | 43 ++++++++++++++++++++++++++-----
 5 files changed, 88 insertions(+), 42 deletions(-)

diff --git a/tests/serve/test_sglang.py b/tests/serve/test_sglang.py
index b698265a5a..8b2c1afb78 100644
--- a/tests/serve/test_sglang.py
+++ b/tests/serve/test_sglang.py
@@ -51,7 +51,7 @@ class SGLangConfig(EngineConfig):
         request_payloads=[
             chat_payload_default(),
             completion_payload_default(),
-            metric_payload_default(min_num_requests=6, backend="sglang"),
+            metric_payload_default(min_num_requests=4, backend="sglang"),
         ],
     ),
     "disaggregated": SGLangConfig(
@@ -78,9 +78,9 @@ class SGLangConfig(EngineConfig):
             chat_payload_default(),
             completion_payload_default(),
             # Validate dynamo_component_* and sglang:* metrics from prefill worker (port 8081)
-            metric_payload_default(min_num_requests=6, backend="sglang", port=8081),
+            metric_payload_default(min_num_requests=4, backend="sglang", port=8081),
             # Validate dynamo_component_* and sglang:* metrics from decode worker (port 8082)
-            metric_payload_default(min_num_requests=6, backend="sglang", port=8082),
+            metric_payload_default(min_num_requests=4, backend="sglang", port=8082),
         ],
     ),
     "kv_events": SGLangConfig(
@@ -119,7 +119,7 @@ class SGLangConfig(EngineConfig):
         models_port=8000,
         request_payloads=[
             chat_payload_default(
-                expected_response=["Successfully Applied Chat Template"]
+                expected_response_all=["Successfully Applied Chat Template"]
             )
         ],
     ),
@@ -147,7 +147,7 @@ class SGLangConfig(EngineConfig):
                 # NOTE: The response text may mention 'bus', 'train', 'streetcar', etc.
                 # so we need something consistently found in the response, or a different
                 # approach to validation for this test to be stable.
-                expected_response=["image"],
+                expected_response_any=["image", "bus", "train", "streetcar"],
                 temperature=0.0,
             )
         ],
@@ -165,13 +165,13 @@ class SGLangConfig(EngineConfig):
             # Test default payload with multiple inputs
             embedding_payload_default(
                 repeat_count=2,
-                expected_response=["Generated 2 embeddings with dimension"],
+                expected_response_all=["Generated 2 embeddings with dimension"],
             ),
             # Test single string input
             embedding_payload(
                 input_text="Hello, world!",
                 repeat_count=1,
-                expected_response=["Generated 1 embeddings with dimension"],
+                expected_response_all=["Generated 1 embeddings with dimension"],
             ),
             # Test multiple string inputs
             embedding_payload(
@@ -181,7 +181,7 @@ class SGLangConfig(EngineConfig):
                     "Natural language processing enables computers to understand text.",
                 ],
                 repeat_count=1,
-                expected_response=["Generated 3 embeddings with dimension"],
+                expected_response_all=["Generated 3 embeddings with dimension"],
             ),
         ],
     ),
diff --git a/tests/serve/test_trtllm.py b/tests/serve/test_trtllm.py
index 99b7243d82..e7e7a8176d 100644
--- a/tests/serve/test_trtllm.py
+++ b/tests/serve/test_trtllm.py
@@ -45,7 +45,7 @@ class TRTLLMConfig(EngineConfig):
         request_payloads=[
             chat_payload_default(),
             completion_payload_default(),
-            metric_payload_default(min_num_requests=6, backend="trtllm"),
+            metric_payload_default(min_num_requests=4, backend="trtllm"),
         ],
     ),
     "disaggregated": TRTLLMConfig(
@@ -70,8 +70,8 @@ class TRTLLMConfig(EngineConfig):
         request_payloads=[
             chat_payload_default(),
             completion_payload_default(),
-            metric_payload_default(port=8081, min_num_requests=6, backend="trtllm"),
-            metric_payload_default(port=8082, min_num_requests=6, backend="trtllm"),
+            metric_payload_default(port=8081, min_num_requests=4, backend="trtllm"),
+            metric_payload_default(port=8082, min_num_requests=4, backend="trtllm"),
         ],
     ),
     "aggregated_router": TRTLLMConfig(
@@ -147,7 +147,7 @@ def test_chat_only_aggregated_with_test_logits_processor(
         script_name=base.script_name,  # agg.sh
         marks=[],  # not used by this direct test
         request_payloads=[
-            chat_payload_default(expected_response=["Hello world!"]),
+            chat_payload_default(expected_response_all=["Hello world!"]),
         ],
         model="Qwen/Qwen3-0.6B",
         delayed_start=base.delayed_start,
diff --git a/tests/serve/test_vllm.py b/tests/serve/test_vllm.py
index ff3cbcccb3..60a704ebcf 100644
--- a/tests/serve/test_vllm.py
+++ b/tests/serve/test_vllm.py
@@ -45,7 +45,7 @@ class VLLMConfig(EngineConfig):
         request_payloads=[
             chat_payload_default(),
             completion_payload_default(),
-            metric_payload_default(min_num_requests=6, backend="vllm"),
+            metric_payload_default(min_num_requests=4, backend="vllm"),
         ],
     ),
     "agg-router": VLLMConfig(
@@ -100,8 +100,8 @@ class VLLMConfig(EngineConfig):
         ],
         timeout=700,
         request_payloads=[
-            chat_payload_default(expected_response=["joke"]),
-            completion_payload_default(expected_response=["joke"]),
+            chat_payload_default(),
+            completion_payload_default(),
         ],
     ),
     "multimodal_agg_llava_epd": VLLMConfig(
@@ -123,7 +123,7 @@ class VLLMConfig(EngineConfig):
                     },
                 ],
                 repeat_count=1,
-                expected_response=["bus"],
+                expected_response_any=["bus", "image"],
                 temperature=0.0,
             )
         ],
@@ -149,7 +149,7 @@ class VLLMConfig(EngineConfig):
                     },
                 ],
                 repeat_count=1,
-                expected_response=["bus"],
+                expected_response_any=["bus", "image"],
             )
         ],
     ),
@@ -175,7 +175,7 @@ class VLLMConfig(EngineConfig):
                     },
                 ],
                 repeat_count=1,
-                expected_response=["bus"],
+                expected_response_any=["bus", "image"],
             ),
             # Base64 data URL test (1x1 PNG inline, avoids network fetch)
             chat_payload(
@@ -189,7 +189,7 @@ class VLLMConfig(EngineConfig):
                     },
                 ],
                 repeat_count=1,
-                expected_response=[],  # Just validate no error
+                expected_response_all=[],  # Just validate no error
             ),
         ],
     ),
@@ -215,7 +215,7 @@ class VLLMConfig(EngineConfig):
                     },
                 ],
                 repeat_count=1,
-                expected_response=["rabbit"],
+                expected_response_any=["rabbit", "video", "detail"],
                 temperature=0.7,
             )
         ],
diff --git a/tests/utils/payload_builder.py b/tests/utils/payload_builder.py
index c6b175170e..7c4d061a36 100644
--- a/tests/utils/payload_builder.py
+++ b/tests/utils/payload_builder.py
@@ -17,7 +17,8 @@
 
 def chat_payload_default(
     repeat_count: int = 2,
-    expected_response: Optional[List[str]] = None,
+    expected_response_all: Optional[List[str]] = None,
+    expected_response_any: Optional[List[str]] = None,
     expected_log: Optional[List[str]] = None,
     max_tokens: int = 300,
     temperature: float = 0,
@@ -37,13 +38,15 @@ def chat_payload_default(
         },
         repeat_count=repeat_count,
         expected_log=expected_log or [],
-        expected_response=expected_response or ["AI"],
+        expected_response_all=expected_response_all or [],
+        expected_response_any=expected_response_any or ["AI", "joke", "short", "robot"],
     )
 
 
 def completion_payload_default(
     repeat_count: int = 2,
-    expected_response: Optional[List[str]] = None,
+    expected_response_all: Optional[List[str]] = None,
+    expected_response_any: Optional[List[str]] = None,
     expected_log: Optional[List[str]] = None,
     max_tokens: int = 300,
     temperature: float = 0,
@@ -58,7 +61,8 @@ def completion_payload_default(
         },
         repeat_count=repeat_count,
         expected_log=expected_log or [],
-        expected_response=expected_response or ["AI"],
+        expected_response_all=expected_response_all or [],
+        expected_response_any=expected_response_any or ["AI", "joke", "short", "robot"],
     )
 
 
@@ -73,7 +77,8 @@ def metric_payload_default(
         body={},
         repeat_count=repeat_count,
         expected_log=expected_log or [],
-        expected_response=[],
+        expected_response_all=[],
+        expected_response_any=[],
         min_num_requests=min_num_requests,
         backend=backend,
         port=port,
@@ -83,7 +88,8 @@ def metric_payload_default(
 def chat_payload(
     content: Union[str, List[Dict[str, Any]]],
     repeat_count: int = 1,
-    expected_response: Optional[List[str]] = None,
+    expected_response_all: Optional[List[str]] = None,
+    expected_response_any: Optional[List[str]] = None,
     expected_log: Optional[List[str]] = None,
     max_tokens: int = 300,
     temperature: Optional[float] = None,
@@ -106,14 +112,16 @@ def chat_payload(
         body=body,
         repeat_count=repeat_count,
         expected_log=expected_log or [],
-        expected_response=expected_response or [],
+        expected_response_all=expected_response_all or [],
+        expected_response_any=expected_response_any or [],
     )
 
 
 def completion_payload(
     prompt: str,
     repeat_count: int = 3,
-    expected_response: Optional[List[str]] = None,
+    expected_response_all: Optional[List[str]] = None,
+    expected_response_any: Optional[List[str]] = None,
     expected_log: Optional[List[str]] = None,
     max_tokens: int = 150,
     temperature: float = 0.1,
@@ -128,13 +136,15 @@ def completion_payload(
         },
         repeat_count=repeat_count,
         expected_log=expected_log or [],
-        expected_response=expected_response or [],
+        expected_response_all=expected_response_all or [],
+        expected_response_any=expected_response_any or [],
     )
 
 
 def embedding_payload_default(
     repeat_count: int = 3,
-    expected_response: Optional[List[str]] = None,
+    expected_response_all: Optional[List[str]] = None,
+    expected_response_any: Optional[List[str]] = None,
     expected_log: Optional[List[str]] = None,
 ) -> EmbeddingPayload:
     return EmbeddingPayload(
@@ -143,15 +153,17 @@ def embedding_payload_default(
         },
         repeat_count=repeat_count,
         expected_log=expected_log or [],
-        expected_response=expected_response
+        expected_response_all=expected_response_all
         or ["Generated 2 embeddings with dimension"],
+        expected_response_any=expected_response_any or [],
     )
 
 
 def embedding_payload(
     input_text: Union[str, List[str]],
     repeat_count: int = 3,
-    expected_response: Optional[List[str]] = None,
+    expected_response_all: Optional[List[str]] = None,
+    expected_response_any: Optional[List[str]] = None,
     expected_log: Optional[List[str]] = None,
 ) -> EmbeddingPayload:
     # Normalize input to list for consistent processing
@@ -168,8 +180,9 @@ def embedding_payload(
         },
         repeat_count=repeat_count,
         expected_log=expected_log or [],
-        expected_response=expected_response
+        expected_response_all=expected_response_all
         or [f"Generated {expected_count} embeddings with dimension"],
+        expected_response_any=expected_response_any or [],
     )
 
 
@@ -181,7 +194,8 @@ def make_chat_health_check(port: int, model: str):
     def _check_chat_endpoint(remaining_timeout: float = 30.0) -> bool:
         payload = chat_payload_default(
             repeat_count=1,
-            expected_response=[],
+            expected_response_all=[],
+            expected_response_any=[],
             max_tokens=8,
             temperature=0.0,
             stream=False,
@@ -195,7 +209,7 @@ def _check_chat_endpoint(remaining_timeout: float = 30.0) -> bool:
                 method=payload.method,
                 log_level=10,
             )
-            # Validate structure only; expected_response is empty
+            # Validate structure only; expected_response_all is empty
             _ = payload.response_handler(resp)
             return True
         except Exception:
@@ -208,7 +222,8 @@ def make_completions_health_check(port: int, model: str):
     def _check_completions_endpoint(remaining_timeout: float = 30.0) -> bool:
         payload = completion_payload_default(
             repeat_count=1,
-            expected_response=[],
+            expected_response_all=[],
+            expected_response_any=[],
             max_tokens=8,
             temperature=0.0,
             stream=False,
diff --git a/tests/utils/payloads.py b/tests/utils/payloads.py
index aea3024450..ebe4527149 100644
--- a/tests/utils/payloads.py
+++ b/tests/utils/payloads.py
@@ -30,8 +30,9 @@ class BasePayload:
     """Generic payload body plus expectations and repeat count."""
 
     body: Dict[str, Any]
-    expected_response: List[str]
     expected_log: List[str]
+    expected_response_all: List[str] = None  # All must match (AND logic)
+    expected_response_any: List[str] = None  # At least one must match (OR logic)
     repeat_count: int = 1
     timeout: int = 60
 
@@ -41,6 +42,13 @@ class BasePayload:
     endpoint: str = ""
     method: str = "POST"
 
+    def __post_init__(self):
+        # Initialize expected_response fields if None
+        if self.expected_response_all is None:
+            self.expected_response_all = []
+        if self.expected_response_any is None:
+            self.expected_response_any = []
+
     def url(self) -> str:
         ep = self.endpoint.lstrip("/")
         return f"http://{self.host}:{self.port}/{ep}"
@@ -51,24 +59,47 @@ def with_model(self, model):
             p.body = {**p.body, "model": model}
         return p
 
+    @property
+    def expected_response(self) -> List[str]:
+        """Backward compatibility - maps to expected_response_all"""
+        return self.expected_response_all
+
     def response_handler(self, response: Any) -> str:
         """Extract a text representation of the response for logging/validation."""
         raise NotImplementedError("Subclasses must implement response_handler()")
 
     def validate(self, response: Any, content: str) -> None:
-        """Default validation: ensure expected substrings appear in content."""
-        if self.expected_response:
+        """Validate expected substrings appear in content using AND/OR logic."""
+        # Check AND logic (all must be present)
+        if self.expected_response_all:
             missing_expected = []
-            for expected in self.expected_response:
+            for expected in self.expected_response_all:
                 if not content or expected not in content:
                     missing_expected.append(expected)
             if missing_expected:
                 preview = (content or "")[:1000]
                 raise AssertionError(
-                    f"Expected content not found in response. Missing: {missing_expected}. "
+                    f"Expected content (ALL) not found in response. Missing: {missing_expected}. "
+                    f"Content preview (first 1000 chars): {preview!r}"
+                )
+            logger.info(
+                f"SUCCESS: All expected_response_all: {self.expected_response_all} found."
+            )
+
+        # Check OR logic (at least one must be present)
+        if self.expected_response_any:
+            found = any(
+                expected in (content or "") for expected in self.expected_response_any
+            )
+            if not found:
+                preview = (content or "")[:1000]
+                raise AssertionError(
+                    f"Expected content (ANY) not found in response. None of {self.expected_response_any} matched. "
                     f"Content preview (first 1000 chars): {preview!r}"
                 )
-        logger.info(f"SUCCESS: All expected_responses: {self.expected_response} found.")
+            logger.info(
+                f"SUCCESS: At least one from expected_response_any: {self.expected_response_any} found."
+            )
 
     def process_response(self, response: Any) -> str:
         """Convenience: run response_handler then validate; return content."""

From be63149d1ded30b88ece322a5172f3ed3736b47c Mon Sep 17 00:00:00 2001
From: alec-flowers <aflowers@nvidia.com>
Date: Sat, 8 Nov 2025 01:20:27 +0000
Subject: [PATCH 3/4] fix mypy

Signed-off-by: alec-flowers <aflowers@nvidia.com>
---
 tests/utils/payloads.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/utils/payloads.py b/tests/utils/payloads.py
index ebe4527149..1f888abdb9 100644
--- a/tests/utils/payloads.py
+++ b/tests/utils/payloads.py
@@ -31,8 +31,8 @@ class BasePayload:
 
     body: Dict[str, Any]
     expected_log: List[str]
-    expected_response_all: List[str] = None  # All must match (AND logic)
-    expected_response_any: List[str] = None  # At least one must match (OR logic)
+    expected_response_all: List[str] | None = None  # All must match (AND logic)
+    expected_response_any: List[str] | None = None  # At least one must match (OR logic)
     repeat_count: int = 1
     timeout: int = 60
 
@@ -60,7 +60,7 @@ def with_model(self, model):
         return p
 
     @property
-    def expected_response(self) -> List[str]:
+    def expected_response(self) -> List[str] | None:
         """Backward compatibility - maps to expected_response_all"""
         return self.expected_response_all
 

From c7093979e3d80845e01236b62be41ee7109e0aad Mon Sep 17 00:00:00 2001
From: alec-flowers <aflowers@nvidia.com>
Date: Sun, 9 Nov 2025 18:59:48 +0000
Subject: [PATCH 4/4] fix override issue

Signed-off-by: alec-flowers <aflowers@nvidia.com>
---
 tests/utils/payload_builder.py | 34 +++++++++++++++++++++++++++-------
 1 file changed, 27 insertions(+), 7 deletions(-)

diff --git a/tests/utils/payload_builder.py b/tests/utils/payload_builder.py
index 7c4d061a36..71362daa53 100644
--- a/tests/utils/payload_builder.py
+++ b/tests/utils/payload_builder.py
@@ -12,13 +12,15 @@
 )
 
 # Common default text prompt used across tests
-TEXT_PROMPT = "Tell me a short joke about AI."
+DEFAULT_TEXT_PROMPT = "Tell me a short joke about AI."
+DEFAULT_EXPECTED_RESPONSE_ANY = ("AI", "joke", "short", "robot")
+_UNSET: Any = object()
 
 
 def chat_payload_default(
     repeat_count: int = 2,
     expected_response_all: Optional[List[str]] = None,
-    expected_response_any: Optional[List[str]] = None,
+    expected_response_any: Optional[List[str]] = _UNSET,
     expected_log: Optional[List[str]] = None,
     max_tokens: int = 300,
     temperature: float = 0,
@@ -29,7 +31,7 @@ def chat_payload_default(
             "messages": [
                 {
                     "role": "user",
-                    "content": TEXT_PROMPT,
+                    "content": DEFAULT_TEXT_PROMPT,
                 }
             ],
             "max_tokens": max_tokens,
@@ -39,14 +41,18 @@ def chat_payload_default(
         repeat_count=repeat_count,
         expected_log=expected_log or [],
         expected_response_all=expected_response_all or [],
-        expected_response_any=expected_response_any or ["AI", "joke", "short", "robot"],
+        expected_response_any=_resolve_expected_response_any(
+            expected_response_all,
+            expected_response_any,
+            DEFAULT_EXPECTED_RESPONSE_ANY,
+        ),
     )
 
 
 def completion_payload_default(
     repeat_count: int = 2,
     expected_response_all: Optional[List[str]] = None,
-    expected_response_any: Optional[List[str]] = None,
+    expected_response_any: Optional[List[str]] = _UNSET,
     expected_log: Optional[List[str]] = None,
     max_tokens: int = 300,
     temperature: float = 0,
@@ -54,7 +60,7 @@ def completion_payload_default(
 ) -> CompletionPayload:
     return CompletionPayload(
         body={
-            "prompt": TEXT_PROMPT,
+            "prompt": DEFAULT_TEXT_PROMPT,
             "max_tokens": max_tokens,
             "temperature": temperature,
             "stream": stream,
@@ -62,10 +68,24 @@ def completion_payload_default(
         repeat_count=repeat_count,
         expected_log=expected_log or [],
         expected_response_all=expected_response_all or [],
-        expected_response_any=expected_response_any or ["AI", "joke", "short", "robot"],
+        expected_response_any=_resolve_expected_response_any(
+            expected_response_all,
+            expected_response_any,
+            DEFAULT_EXPECTED_RESPONSE_ANY,
+        ),
     )
 
 
+def _resolve_expected_response_any(
+    expected_response_all: Optional[List[str]],
+    expected_response_any: Optional[List[str]],
+    default_any: List[str],
+) -> List[str]:
+    if expected_response_any is _UNSET:
+        return [] if expected_response_all is not None else list(default_any)
+    return expected_response_any or []
+
+
 def metric_payload_default(
     min_num_requests: int,
     repeat_count: int = 1,