14 changes: 13 additions & 1 deletion tests/fault_tolerance/deploy/README.md
@@ -119,6 +119,17 @@ The following failure types are defined in `scenarios.py`:
| `sglang_prefill_scheduler` | Terminate SGLang prefill scheduler process. | `SIGKILL` to `sglang::scheduler`| sglang only |
| `sglang_prefill_detokenizer` | Terminate SGLang prefill detokenizer process. | `SIGKILL` to `sglang::detokenizer`| sglang only |

#### Token Overflow Tests

In addition to process and pod failures, the suite includes tests for **token overflow**, where the model receives an input prompt larger than its configured `max_seq_len`. These tests verify that the system gracefully rejects invalid requests without crashing.

- **Failure Injection**: Unlike the other failure types, this one is injected from the **client side**. The `aiperf` client is configured to send a batch of requests with oversized token lengths.
- **Two-Phase Execution**: These tests run in two distinct phases, creating separate log directories for each:
1. **`overflow` Phase**: Sends oversized requests. The expected outcome is a high rate of failed requests (rejections) as the server correctly identifies and blocks them.
2. **`recovery` Phase**: Immediately after the overflow phase, sends valid, normal-sized requests. The expected outcome is a high success rate, confirming that the system has recovered and remains operational.

The combined results of these two phases demonstrate both the system's ability to reject invalid inputs and its stability after handling them.
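
To make the two phases concrete, here is a minimal sketch of the kind of check the suite performs, assuming an OpenAI-compatible completions endpoint. The URL, model name, prompt sizes, request counts, and the 90% thresholds are illustrative; the real tests drive this through the `aiperf` client rather than raw HTTP calls.

```python
# Sketch only: two-phase overflow/recovery check against an assumed
# OpenAI-compatible endpoint. Values below are placeholders.
import requests

BASE_URL = "http://localhost:8000/v1/completions"  # assumed serving endpoint
MODEL = "example-model"                             # assumed model name


def send_prompt(num_words: int) -> bool:
    """Send a prompt of roughly num_words tokens; return True on HTTP success."""
    prompt = "word " * num_words
    resp = requests.post(
        BASE_URL,
        json={"model": MODEL, "prompt": prompt, "max_tokens": 16},
        timeout=60,
    )
    return resp.ok


# Phase 1 (overflow): oversized prompts should be rejected, not crash the server.
overflow_results = [send_prompt(100_000) for _ in range(8)]  # assumed > max_seq_len
failure_rate = 100.0 * overflow_results.count(False) / len(overflow_results)
assert failure_rate >= 90.0, "oversized requests should be rejected"

# Phase 2 (recovery): normal prompts should succeed, showing the system recovered.
recovery_results = [send_prompt(32) for _ in range(8)]
success_rate = 100.0 * recovery_results.count(True) / len(recovery_results)
assert success_rate >= 90.0, "valid requests should succeed after the overflow phase"
```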

#### Example Scenario Breakdown

**Scenario**: `sglang-agg-tp-2-dp-1-decode_worker`
@@ -392,7 +403,6 @@ graph LR
style DecodePool stroke:#000,stroke-width:2px
```


#### Summary:


@@ -596,3 +606,5 @@ Test Group: vllm-agg-tp-1-dp-2
╘═══════════════════╧═══════════╧═══════════╧══════════╧═══════════╧══════════╧═══════════╧═══════════╧════════════╛

```


29 changes: 16 additions & 13 deletions tests/fault_tolerance/deploy/legacy_parse_results.py
@@ -418,14 +418,15 @@ def process_test_directory(test_dir, sla):
}


def main(logs_dir, tablefmt, log_paths=None, sla=None):
def main(logs_dir, tablefmt, log_paths=None, sla=None, print_output=True):
"""Main entry point for parsing legacy client results.

Args:
logs_dir: Base directory containing test results
tablefmt: Table format for output (e.g., "fancy_grid")
log_paths: Optional list of specific log paths to process
sla: Optional SLA threshold for latency violations
print_output: If True, print tables and summaries. If False, only return results.
"""
results = []

@@ -542,19 +543,21 @@ def main(logs_dir, tablefmt, log_paths=None, sla=None):
]
rows.append(row)

print(f"\nTest Group: {test_prefix}")
print(
tabulate(
rows,
headers,
tablefmt=tablefmt,
floatfmt=".2f",
missingval="N/A",
numalign="right",
stralign="center",
if print_output:
logging.info(f"\nTest Group: {test_prefix}")
logging.info(
"\n"
+ tabulate(
rows,
headers,
tablefmt=tablefmt,
floatfmt=".2f",
missingval="N/A",
numalign="right",
stralign="center",
)
)
)
print("\n" + "=" * 80)
logging.info("\n" + "=" * 80)


if __name__ == "__main__":
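For reference, a hypothetical call into the updated legacy parser showing the new `print_output` flag. The import path follows the file location, and the logs directory, table format, and SLA value are placeholders.

```python
# Sketch only: collect results quietly for programmatic post-processing.
from tests.fault_tolerance.deploy.legacy_parse_results import main

results = main(
    logs_dir="logs/vllm-agg-tp-1-dp-2",  # assumed results directory
    tablefmt="fancy_grid",
    sla=2.0,              # optional latency SLA threshold (illustrative)
    print_output=False,   # new flag from this change: skip table/summary output
)
```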
28 changes: 19 additions & 9 deletions tests/fault_tolerance/deploy/parse_factory.py
@@ -103,7 +103,9 @@ def parse_test_results(
log_paths: Optional[List[str]] = None,
tablefmt: str = "grid",
sla: Optional[float] = None,
success_threshold: float = 90.0,
force_parser: Optional[str] = None,
print_output: bool = True,
) -> Any:
"""Auto-detect and parse test results using the appropriate parser.

@@ -116,8 +118,10 @@
log_paths: List of log directories to process (for multiple directories)
tablefmt: Table format for output (e.g., "fancy_grid", "pipe")
sla: Optional SLA threshold for latency violations
success_threshold: Success rate threshold for pass/fail (default: 90.0)
force_parser: Optional override to force using a specific parser
("aiperf" or "legacy"). If not provided, auto-detection is used.
print_output: If True, print tables and summaries. If False, only return results.

Returns:
Results from the appropriate parser
@@ -189,13 +193,17 @@ def parse_test_results(
log_paths=log_paths,
tablefmt=tablefmt,
sla=sla,
success_threshold=success_threshold,
print_output=print_output,
)
else:
return parse_aiperf(
logs_dir=log_dir,
log_paths=None,
tablefmt=tablefmt,
sla=sla,
success_threshold=success_threshold,
print_output=print_output,
)

elif parser_type == "legacy":
@@ -209,13 +217,15 @@
log_paths=log_paths,
tablefmt=tablefmt,
sla=sla,
print_output=print_output,
)
else:
return parse_legacy(
logs_dir=log_dir,
log_paths=None,
tablefmt=tablefmt,
sla=sla,
print_output=print_output,
)

else:
@@ -294,18 +304,18 @@ def print_result_info(log_dir: str) -> None:
"""
info = get_result_info(log_dir)

print(f"\nTest Results Information: {log_dir}")
print("=" * 60)
print(f"Result Type: {info['type'] or 'Unknown'}")
print(f"Client Count: {info['client_count']}")
print(f"Has Test Log: {info['has_test_log']}")
logging.info(f"\nTest Results Information: {log_dir}")
logging.info("=" * 60)
logging.info(f"Result Type: {info['type'] or 'Unknown'}")
logging.info(f"Client Count: {info['client_count']}")
logging.info(f"Has Test Log: {info['has_test_log']}")

if info["details"]:
print("\nDetails:")
logging.info("\nDetails:")
for key, value in info["details"].items():
print(f" {key}: {value}")
logging.info(f" {key}: {value}")

print("=" * 60)
logging.info("=" * 60)


if __name__ == "__main__":
@@ -354,7 +364,7 @@ def print_result_info(log_dir: str) -> None:
for log_path in args.log_paths:
print_result_info(log_path)
else:
print("Error: Must provide log_dir or --log-paths")
logging.error("Must provide log_dir or --log-paths")
else:
# Parse mode
try:
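Similarly, a sketch of how the factory entry point might be called with the `success_threshold` and `print_output` parameters added here. The leading positional log-directory argument, its value, and the import path are assumptions based on the surrounding code.

```python
# Sketch only: exercise the new success_threshold / print_output keywords.
from tests.fault_tolerance.deploy.parse_factory import parse_test_results

results = parse_test_results(
    "logs/sglang-agg-tp-2-dp-1-decode_worker",  # assumed base log directory
    tablefmt="pipe",
    sla=2.0,                  # optional latency SLA threshold (illustrative)
    success_threshold=90.0,   # pass/fail cutoff on request success rate
    print_output=False,       # return parsed results without logging tables
)
```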