diff --git a/docs/getting-started/benchmark.md b/docs/getting-started/benchmark.md index 6173907f6..fb5792dc5 100644 --- a/docs/getting-started/benchmark.md +++ b/docs/getting-started/benchmark.md @@ -166,6 +166,88 @@ guidellm benchmark --profile sweep | `--rate` | Number of strategies to run in the sweep (including synchronous and throughput). | `--rate 10` | | `--rampup` | Rate rampup duration in seconds for throughput and constant strategy steps. | `--rampup 10` | +##### Sweep Profile Configuration + +The sweep profile includes advanced configuration options for optimizing benchmarks on CPU-based deployments. These parameters help manage saturation detection and prevent graph artifacts: + +**Available Parameters:** + +| Parameter | Description | Default | Environment Variable | +| ----------------------------- | ------------------------------------------------- | ------- | ------------------------------------- | +| `--exclude-throughput-target` | Stop constant-rate tests before throughput level | `false` | `GUIDELLM__EXCLUDE_THROUGHPUT_TARGET` | +| `--exclude-throughput-result` | Exclude throughput benchmark from saved results | `false` | `GUIDELLM__EXCLUDE_THROUGHPUT_RESULT` | +| `--saturation-threshold` | Efficiency threshold for stopping sweep (0.0-1.0) | `0.98` | `GUIDELLM__SATURATION_THRESHOLD` | + +**When to Use:** + +- **CPU-based systems under test**: Enable `exclude-throughput-target` and `exclude-throughput-result` to prevent anomalous data points in performance graphs (TTFT spikes, inter-token latency anomalies) +- **GPU-based systems under test**: Use the default settings (both exclusion flags disabled; the saturation threshold stays at its default of `0.98`) + +**Example for CPU-optimized benchmarking:** + +```bash +guidellm benchmark \ + --target "http://localhost:8000" \ + --profile sweep \ + --exclude-throughput-target true \ + --exclude-throughput-result true \ + --saturation-threshold 0.98 \ + --data "prompt_tokens=256,output_tokens=128" \ + --max-seconds 300 +``` + +**Using Environment Variables:** + +```bash +export 
GUIDELLM__EXCLUDE_THROUGHPUT_TARGET=true +export GUIDELLM__EXCLUDE_THROUGHPUT_RESULT=true +export GUIDELLM__SATURATION_THRESHOLD=0.98 + +guidellm benchmark \ + --target "http://localhost:8000" \ + --profile sweep \ + --data "prompt_tokens=256,output_tokens=128" +``` + +**How It Works:** + +The sweep profile runs tests in this order: + +1. **Synchronous test**: Measures baseline single-request performance +2. **Throughput test**: Discovers maximum server capacity with parallel requests +3. **Constant-rate tests**: Run at rates interpolated between the synchronous and throughput rates + +Each parameter optimizes a different aspect: + +- **`exclude-throughput-target`**: Prevents generating a constant-rate test at the throughput level itself + + - **Why**: The highest constant-rate test would target the same rate as the throughput test, creating redundant "elbow" artifacts in graphs + - **Effect**: Stops constant-rate tests just before reaching the throughput rate + +- **`exclude-throughput-result`**: Removes the throughput benchmark from saved results + + - **Why**: Throughput tests measure burst capacity with severe queuing (e.g., 23+ second TTFT), creating extreme outliers in graphs + - **Effect**: Graphs only show sustainable performance metrics from constant-rate tests + +- **`saturation-threshold`**: Stops the sweep when efficiency drops below the threshold + + - **Why**: Once saturation is detected (achieved rate < target rate × threshold), further tests provide diminishing returns + - **Effect**: Saves time by stopping early when the server can no longer meet target rates + +**Why use all three together?** + +For CPU-based systems under test, all three parameters work together: + +- `saturation-threshold` stops the sweep efficiently when saturation is detected +- `exclude-throughput-target` prevents testing at the unsustainable throughput rate +- `exclude-throughput-result` removes the anomalous throughput spike from graphs + +This combination produces clean, efficient 
benchmarks that focus on sustainable performance ranges. + +**Important Note:** + +Do not set `--max-concurrency` or `GUIDELLM__MAX_CONCURRENCY` when running sweep tests. The sweep profile uses the throughput test to discover the server's true capacity, and artificially limiting concurrency will result in an underestimated throughput measurement. This causes the constant-rate tests to run at rates far below the actual server capacity, preventing proper saturation detection and producing misleading results where TTFT may decrease instead of increase. + ## Data Options ### Synthetic Data Options diff --git a/src/guidellm/benchmark/entrypoints.py b/src/guidellm/benchmark/entrypoints.py index 1f0ed3043..f70a5664c 100644 --- a/src/guidellm/benchmark/entrypoints.py +++ b/src/guidellm/benchmark/entrypoints.py @@ -565,7 +565,14 @@ async def benchmark_generative_text( prefer_response_metrics=args.prefer_response_metrics, ): if benchmark: - report.benchmarks.append(benchmark) + # Check if we should exclude the throughput benchmark + should_exclude = ( + hasattr(profile, "exclude_throughput_result") + and profile.exclude_throughput_result + and benchmark.config.strategy.type_ == "throughput" + ) + if not should_exclude: + report.benchmarks.append(benchmark) output_format_results = {} for key, output in output_formats.items(): diff --git a/src/guidellm/benchmark/profiles.py b/src/guidellm/benchmark/profiles.py index 054356c10..68cfa0362 100644 --- a/src/guidellm/benchmark/profiles.py +++ b/src/guidellm/benchmark/profiles.py @@ -595,6 +595,35 @@ class SweepProfile(Profile): default=42, description="Random seed for Poisson distribution strategy", ) + exclude_throughput_target: bool = Field( + default=False, + description=( + "Exclude constant-rate test at throughput level. " + "When True, constant-rate tests stop before reaching throughput rate, " + "preventing 'elbow' artifacts in performance graphs. " + "Recommended for CPU-based deployments." 
+ ), + ) + exclude_throughput_result: bool = Field( + default=False, + description=( + "Exclude throughput benchmark from saved results. " + "When True, the throughput benchmark is not saved to the report, " + "preventing anomalous data points in graphs. " + "Recommended for CPU based system under test when saturation is detected." + ), + ) + saturation_threshold: float = Field( + default=0.98, + ge=0.0, + le=1.0, + description=( + "Efficiency threshold for saturation detection (achieved/target rate). " + "Sweep stops when efficiency drops below this value. " + "Default 0.98 (98%) is recommended for CPU based system under test. " + "Use 0.95 (95%) for noisier systems, 0.99 (99%) for very stable systems." + ), + ) synchronous_rate: float = Field( default=-1.0, description="Measured rate from synchronous strategy execution", @@ -634,6 +663,24 @@ def resolve_args( kwargs["random_seed"] = random_seed if rate_type in ["constant", "poisson"]: kwargs["strategy_type"] = rate_type + + # Resolve sweep profile parameters from settings if not provided + if ( + "exclude_throughput_target" not in kwargs + or kwargs["exclude_throughput_target"] is None + ): + kwargs["exclude_throughput_target"] = settings.exclude_throughput_target + if ( + "exclude_throughput_result" not in kwargs + or kwargs["exclude_throughput_result"] is None + ): + kwargs["exclude_throughput_result"] = settings.exclude_throughput_result + if ( + "saturation_threshold" not in kwargs + or kwargs["saturation_threshold"] is None + ): + kwargs["saturation_threshold"] = settings.saturation_threshold + return kwargs @property @@ -645,7 +692,7 @@ def strategy_types(self) -> list[str]: types += [self.strategy_type] * (self.sweep_size - len(types)) return types - def next_strategy( + def next_strategy( # noqa: C901 self, prev_strategy: SchedulingStrategy | None, prev_benchmark: Benchmark | None, @@ -685,13 +732,57 @@ def next_strategy( "Invalid rates in sweep; aborting. " "Were there any successful requests?" 
) - self.measured_rates = list( - np.linspace( - self.synchronous_rate, - self.throughput_rate, - self.sweep_size - 1, - ) - )[1:] # don't rerun synchronous + + # Generate interpolated rates between synchronous and throughput. + # The behavior depends on exclude_throughput_target setting: + # + # When exclude_throughput_target=False (default, GPU mode): + # - Generate (sweep_size - 1) points from sync to throughput + # - Remove sync (already tested), keep throughput-level test + # - Example: sweep_size=10 -> 9 points, remove 1 = 8 async tests + # - Last async test targets throughput_rate + # + # When exclude_throughput_target=True (CPU mode): + # - Generate (sweep_size) points from sync to throughput + # - Remove sync AND throughput-level test + # - Example: sweep_size=10 -> 10 points, remove 2 = 8 async tests + # - Last async test stops before throughput_rate + # - Prevents "elbow" artifact in graphs + if self.exclude_throughput_target: + # CPU mode: stop before throughput level + self.measured_rates = list( + np.linspace( + self.synchronous_rate, + self.throughput_rate, + self.sweep_size, + ) + )[1:-1] + else: + # GPU mode: include throughput level + self.measured_rates = list( + np.linspace( + self.synchronous_rate, + self.throughput_rate, + self.sweep_size - 1, + ) + )[1:] + + # Check for saturation: if the previous constant-rate test couldn't + # achieve its target rate, the system has saturated + if ( + prev_strategy + and prev_strategy.type_ in ["constant", "poisson"] + and prev_benchmark + and hasattr(prev_strategy, "rate") + and hasattr(prev_benchmark, "metrics") + ): + target_rate = prev_strategy.rate # type: ignore[attr-defined] + achieved_rate = prev_benchmark.metrics.requests_per_second.successful.mean # type: ignore[attr-defined] + + # If achieved rate is below threshold, system is saturated + if achieved_rate < (target_rate * self.saturation_threshold): + # System saturated - don't test higher rates + return None next_index = ( 
len(self.completed_strategies) - 1 - 1 diff --git a/src/guidellm/benchmark/schemas/generative/entrypoints.py b/src/guidellm/benchmark/schemas/generative/entrypoints.py index f04365992..6a21d603d 100644 --- a/src/guidellm/benchmark/schemas/generative/entrypoints.py +++ b/src/guidellm/benchmark/schemas/generative/entrypoints.py @@ -235,6 +235,33 @@ def get_default(cls: type[BenchmarkGenerativeTextArgs], field: str) -> Any: default=None, description="Additional dataloader configuration arguments" ) random_seed: int = Field(default=42, description="Random seed for reproducibility") + # Sweep profile configuration + exclude_throughput_target: bool | None = Field( + default=None, + description=( + "Exclude constant-rate test at throughput level. " + "When True, constant-rate tests stop before reaching throughput rate. " + "Recommended for CPU-based deployments." + ), + ) + exclude_throughput_result: bool | None = Field( + default=None, + description=( + "Exclude throughput benchmark from saved results. " + "When True, throughput benchmark is not saved to the report. " + "Recommended for CPU-based deployments when saturation is detected." + ), + ) + saturation_threshold: float | None = Field( + default=None, + ge=0.0, + le=1.0, + description=( + "Efficiency threshold for saturation detection (achieved/target rate). " + "Sweep stops when efficiency drops below this value. " + "Default 0.98 (98%) is recommended for CPU based system under test." 
+ ), + ) # Output configuration outputs: list[str] | tuple[str] = Field( default_factory=lambda: ["json", "csv"], diff --git a/src/guidellm/data/builders.py b/src/guidellm/data/builders.py index a75513cbd..b902d6510 100644 --- a/src/guidellm/data/builders.py +++ b/src/guidellm/data/builders.py @@ -320,13 +320,10 @@ def _extract_column_names( except (KeyError, IndexError): prefix_column = None - try: - output_mappings = column_mapper.datasets_column_mappings[ - ("output_tokens_count_column", 0) - ] - output_column = output_mappings[0][1] - except (KeyError, IndexError): - output_column = "output_tokens_count" + output_mappings = column_mapper.datasets_column_mappings.get( + ("output_tokens_count_column", 0), [] + ) + output_column = output_mappings[0][1] if output_mappings else "output_tokens_count" return prompt_column, prefix_column, output_column diff --git a/src/guidellm/settings.py b/src/guidellm/settings.py index f16ac5b6c..74a209a98 100644 --- a/src/guidellm/settings.py +++ b/src/guidellm/settings.py @@ -105,6 +105,11 @@ class Settings(BaseSettings): constraint_error_window_size: float = 30 constraint_error_min_processed: float = 30 + # Sweep profile settings + exclude_throughput_target: bool = False + exclude_throughput_result: bool = False + saturation_threshold: float = 0.98 + # Data settings dataset: DatasetSettings = DatasetSettings()