From 552375f3e4ab49a130b610f551d513d8660a045f Mon Sep 17 00:00:00 2001 From: Vincent Gimenes Date: Sat, 28 Feb 2026 14:29:11 +0100 Subject: [PATCH 01/27] Add replay from trace strategy Add trace replay capability to GuideLLM for reproducing real-world request patterns from trace files. This enables time-based request rate replay and synthetic prompt generation matching trace token counts. - Add TraceReplayStrategy for scheduling requests at precise timestamps - Add ReplayProfile for configuring trace-based benchmarking - Add TraceSyntheticDatasetDeserializer for generating prompts from traces - Support max_requests truncation to limit trace length This is a minimal implementation to address issue 597. Full Mooncake format support, E2E tests, and documentation will follow in subsequent PRs. Signed-off-by: Vincent Gimenes --- src/guidellm/benchmark/entrypoints.py | 101 +++-- src/guidellm/benchmark/profiles.py | 121 +++++- src/guidellm/data/deserializers/__init__.py | 2 + .../data/deserializers/trace_synthetic.py | 159 +++++++ src/guidellm/data/trace_io.py | 92 ++++ src/guidellm/scheduler/__init__.py | 4 + src/guidellm/scheduler/strategies.py | 55 ++- tests/unit/benchmark/test_replay_profile.py | 409 ++++++++++++++++++ .../deserializers/test_trace_synthetic.py | 134 ++++++ tests/unit/scheduler/test_trace_replay.py | 253 +++++++++++ 10 files changed, 1295 insertions(+), 35 deletions(-) create mode 100644 src/guidellm/data/deserializers/trace_synthetic.py create mode 100644 src/guidellm/data/trace_io.py create mode 100644 tests/unit/benchmark/test_replay_profile.py create mode 100644 tests/unit/data/deserializers/test_trace_synthetic.py create mode 100644 tests/unit/scheduler/test_trace_replay.py diff --git a/src/guidellm/benchmark/entrypoints.py b/src/guidellm/benchmark/entrypoints.py index 1f0ed3043..1cce293e8 100644 --- a/src/guidellm/benchmark/entrypoints.py +++ b/src/guidellm/benchmark/entrypoints.py @@ -250,6 +250,7 @@ async def resolve_request_loader( data_num_workers: int | None, random_seed: int, console: Console | None = None, + max_requests: int | None = None, **dataloader_kwargs: dict[str, Any] | None, ) -> DataLoader[GenerationRequest]: """ @@ -273,6 +274,7 @@ async def resolve_request_loader( :param data_num_workers: Number of worker processes for data loading :param random_seed: Seed for reproducible random operations :param console: Console instance for progress reporting, or None + :param max_requests: If set, first data source loads at most this many rows. 
:param dataloader_kwargs: Additional arguments passed to DataLoader initialization :return: Configured DataLoader instance for GenerationRequest objects :raises ValueError: If request formatter type is not registered in @@ -309,6 +311,17 @@ async def resolve_request_loader( data_finalizer, ) + # When max_requests is set, limit the first data source to that many rows at load + if max_requests is not None and data: + if max_requests < 1: + raise ValueError( + "max_requests must be >= 1 when set for data truncation, " + f"got {max_requests}" + ) + data_args = list(data_args) if data_args else [{} for _ in data] + if len(data_args) >= 1: + data_args[0] = {**data_args[0], "max_rows": max_requests} + request_loader: DataLoader[GenerationRequest] = DataLoader( data=data, data_args=data_args, @@ -355,6 +368,7 @@ async def resolve_profile( max_global_error_rate: float | None, over_saturation: dict[str, Any] | None = None, console: Console | None = None, + data: list[Any] | None = None, ) -> Profile: """ Resolve and configure a benchmark profile with rate and constraint settings. @@ -376,6 +390,7 @@ async def resolve_profile( :param max_global_error_rate: Maximum global error rate threshold before stopping :param over_saturation: Over-saturation detection configuration (dict) :param console: Console instance for progress reporting, or None + :param data: Optional list of data sources. :return: Configured Profile instance ready for benchmarking :raises ValueError: If constraints are provided with a pre-configured Profile """ @@ -403,6 +418,7 @@ async def resolve_profile( random_seed=random_seed, rampup_duration=rampup, constraints={**constraints}, + data=data, ) elif constraints: raise ValueError( @@ -489,24 +505,58 @@ async def benchmark_generative_text( processor = await resolve_processor( processor=args.processor, model=model, console=console ) - request_loader = await resolve_request_loader( - data=args.data, - model=model, - data_args=args.data_args, - data_samples=args.data_samples, - processor=processor, - processor_args=args.processor_args, - data_column_mapper=args.data_column_mapper, - data_preprocessors=args.data_preprocessors, - data_preprocessors_kwargs=args.data_preprocessors_kwargs, - data_finalizer=args.data_finalizer, - data_collator=args.data_collator, - data_sampler=args.data_sampler, - data_num_workers=args.data_num_workers, - random_seed=args.random_seed, - console=console, - **(args.dataloader_kwargs or {}), - ) + + # Build common kwargs for resolve_profile and resolve_request_loader + profile_kwargs = { + "profile": args.profile, + "rate": args.rate, + "random_seed": args.random_seed, + "rampup": args.rampup, + "constraints": constraints, + "max_seconds": args.max_seconds, + "max_requests": args.max_requests, + "max_errors": args.max_errors, + "max_error_rate": args.max_error_rate, + "max_global_error_rate": args.max_global_error_rate, + "over_saturation": args.over_saturation, + "console": console, + } + loader_kwargs = { + "data": args.data, + "model": model, + "data_args": args.data_args, + "data_samples": args.data_samples, + "processor": processor, + "processor_args": args.processor_args, + "data_column_mapper": args.data_column_mapper, + "data_preprocessors": args.data_preprocessors, + "data_preprocessors_kwargs": args.data_preprocessors_kwargs, + "data_finalizer": args.data_finalizer, + "data_collator": args.data_collator, + "data_sampler": args.data_sampler, + "data_num_workers": args.data_num_workers, + "random_seed": args.random_seed, + "console": console, + } + + # 
For replay profile: resolve profile first to apply max_seconds filtering, + # then use the filtered count for the data loader. This ensures the data + # loader and scheduler both work with the same filtered request count. + if args.profile == "replay": + profile = await resolve_profile(**profile_kwargs, data=args.data) # type: ignore[arg-type] + effective_max_requests = ( + profile.constraints.get("max_requests") + if profile.constraints + else args.max_requests + ) + request_loader = await resolve_request_loader( + **loader_kwargs, max_requests=effective_max_requests + ) # type: ignore[arg-type] + else: + request_loader = await resolve_request_loader( + **loader_kwargs, max_requests=args.max_requests + ) # type: ignore[arg-type] + profile = await resolve_profile(**profile_kwargs, data=None) # type: ignore[arg-type] warmup = TransientPhaseConfig.create_from_value(args.warmup) cooldown = TransientPhaseConfig.create_from_value(args.cooldown) @@ -522,21 +572,6 @@ async def benchmark_generative_text( ), status="success", ) - - profile = await resolve_profile( - profile=args.profile, - rate=args.rate, - random_seed=args.random_seed, - rampup=args.rampup, - constraints=constraints, - max_seconds=args.max_seconds, - max_requests=args.max_requests, - max_errors=args.max_errors, - max_error_rate=args.max_error_rate, - max_global_error_rate=args.max_global_error_rate, - over_saturation=args.over_saturation, - console=console, - ) output_formats = await resolve_output_formats( outputs=args.outputs, output_dir=args.output_dir, console=console ) diff --git a/src/guidellm/benchmark/profiles.py b/src/guidellm/benchmark/profiles.py index 054356c10..11c1de0fe 100644 --- a/src/guidellm/benchmark/profiles.py +++ b/src/guidellm/benchmark/profiles.py @@ -13,6 +13,7 @@ from abc import ABC, abstractmethod from collections.abc import Generator +from pathlib import Path from typing import TYPE_CHECKING, Annotated, Any, ClassVar, Literal import numpy as np @@ -37,6 +38,8 @@ SchedulingStrategy, SynchronousStrategy, ThroughputStrategy, + TraceReplayStrategy, + load_relative_timestamps, ) from guidellm.schemas import PydanticClassRegistryMixin @@ -48,13 +51,14 @@ "ConcurrentProfile", "Profile", "ProfileType", + "ReplayProfile", "SweepProfile", "SynchronousProfile", "ThroughputProfile", ] ProfileType = Annotated[ - Literal["synchronous", "concurrent", "throughput", "async", "sweep"], + Literal["synchronous", "concurrent", "throughput", "async", "sweep", "replay"], "Profile type identifiers for polymorphic deserialization", ] @@ -328,6 +332,121 @@ def next_strategy( return SynchronousStrategy() +@Profile.register("replay") +class ReplayProfile(Profile): + """ + Replay a trace file: + schedule each request at start_time + time_scale * relative_timestamp[i]. + + For this profile, the ``rate`` argument is interpreted as time_scale (scale factor + applied to relative timestamps), not as requests per second. + + When ``constraints["max_requests"]`` is set, the trace is truncated at load time: + only the first max_requests rows are loaded from the file for both timestamps (here) + and request data (in the data loader). This keeps timestamps and requests aligned. + The trace file is read twice: once by the data pipeline for request payloads, and + once here for relative timestamps. 
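+
+    Illustrative example (hypothetical two-line trace)::
+
+        {"timestamp": 0.0, "input_length": 256, "output_length": 128}
+        {"timestamp": 0.5, "input_length": 512, "output_length": 64}
+
+    With ``rate=[2.0]`` (time_scale=2.0), the second request is scheduled
+    1.0 second after the first instead of 0.5 seconds.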
+ """ + + type_: Literal["replay"] = "replay" # type: ignore[assignment] + relative_timestamps: list[float] = Field( + description="Request start times relative to first event (first = 0)", + ) + time_scale: float = Field( + default=1.0, + gt=0, + description="Scale factor applied to relative timestamps", + ) + max_seconds_filter: float | None = Field( + default=None, + description=( + "Original max_seconds value used as a load-time filter " + "(not a runtime constraint)" + ), + ) + + @classmethod + def resolve_args( + cls, + rate_type: str, + rate: list[float] | None, + random_seed: int, + **kwargs: Any, + ) -> dict[str, Any]: + _ = (rate_type, random_seed) # unused + data = kwargs.get("data") + if not data or not data[0]: + raise ValueError("Replay profile requires data (path to trace file)") + path = Path(data[0]) if isinstance(data[0], str) else data[0] + if not path.exists(): + raise ValueError(f"Replay trace file not found: {path}") + constraints = kwargs.get("constraints") or {} + max_requests = constraints.get("max_requests") + if max_requests is not None and max_requests < 1: + raise ValueError( + "max_requests must be >= 1 when set for replay profile, " + f"got {max_requests}" + ) + + # For replay profile, rate is interpreted as time_scale (not requests per + # second) + time_scale = rate[0] if rate and len(rate) > 0 else 1.0 + + # Load all timestamps first (max_requests applied after max_seconds filtering) + relative_timestamps = load_relative_timestamps(path) + + # Filter by max_seconds (applied in simulated time via time_scale) + max_seconds = constraints.get("max_seconds") + if max_seconds is not None and max_seconds > 0: + relative_timestamps = [ + ts for ts in relative_timestamps if ts * time_scale <= max_seconds + ] + + # Truncate by max_requests on top of any max_seconds filtering + if max_requests is not None: + relative_timestamps = relative_timestamps[:max_requests] + + if not relative_timestamps: + raise ValueError( + "No timestamps remain after applying max_seconds and max_requests " + "filters. The trace is empty or all events were filtered out." + ) + + # Set max_requests to the actual count after filtering to prevent benchmark hang + # and eliminate race conditions between request completion and injection. 
+ constraints["max_requests"] = len(relative_timestamps) + + # Remove max_seconds to avoid runtime MaxDurationConstraint canceling + # in-flight requests + constraints.pop("max_seconds", None) + + return { + "relative_timestamps": relative_timestamps, + "time_scale": time_scale, + "constraints": constraints, + "max_seconds_filter": max_seconds if max_seconds and max_seconds > 0 + else None, + } + + @property + def strategy_types(self) -> list[str]: + return ["trace"] + + def next_strategy( + self, + prev_strategy: SchedulingStrategy | None, + prev_benchmark: Benchmark | None, + ) -> TraceReplayStrategy | None: + _ = prev_benchmark + # Replay has a single strategy; return it once, then None + if prev_strategy is not None: + return None + return TraceReplayStrategy( + relative_timestamps=self.relative_timestamps, + time_scale=self.time_scale, + ) + + @Profile.register("concurrent") class ConcurrentProfile(Profile): """ diff --git a/src/guidellm/data/deserializers/__init__.py b/src/guidellm/data/deserializers/__init__.py index fb22fd2a7..4fdfbceae 100644 --- a/src/guidellm/data/deserializers/__init__.py +++ b/src/guidellm/data/deserializers/__init__.py @@ -25,6 +25,7 @@ SyntheticTextDataset, SyntheticTextDatasetDeserializer, ) +from .trace_synthetic import TraceSyntheticDatasetDeserializer __all__ = [ "ArrowFileDatasetDeserializer", @@ -46,4 +47,5 @@ "SyntheticTextDatasetDeserializer", "TarFileDatasetDeserializer", "TextFileDatasetDeserializer", + "TraceSyntheticDatasetDeserializer", ] diff --git a/src/guidellm/data/deserializers/trace_synthetic.py b/src/guidellm/data/deserializers/trace_synthetic.py new file mode 100644 index 000000000..fe366f69d --- /dev/null +++ b/src/guidellm/data/deserializers/trace_synthetic.py @@ -0,0 +1,159 @@ +""" +Trace file deserializer that generates synthetic prompts per row. + +Reads a trace file (timestamp, input_length, output_length) and yields one row per +line with a synthetic prompt matching the requested input_length for replay benchmarks. 
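+
+Illustrative mapping for one hypothetical trace row::
+
+    {"timestamp": 0.5, "input_length": 128, "output_length": 32}
+    -> {"prompt": "<synthetic text encoding to 128 tokens>",
+        "prompt_tokens_count": 128, "output_tokens_count": 32}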
+""" + +from __future__ import annotations + +from collections.abc import Callable +from pathlib import Path +from typing import Any + +from datasets import Dataset +from faker import Faker +from transformers import PreTrainedTokenizerBase + +from guidellm.data.deserializers.deserializer import ( + DataNotSupportedError, + DatasetDeserializer, + DatasetDeserializerFactory, +) +from guidellm.data.trace_io import load_trace_rows + +__all__ = ["TraceSyntheticDatasetDeserializer"] + + +def _create_prompt( + processor: PreTrainedTokenizerBase, + prompt_tokens_count: int, + faker: Faker, + unique: str = "", +) -> str: + """Generate text that tokenizes to exactly prompt_tokens_count tokens.""" + prompt_token_ids: list[int] = [] + avg_chars_per_token = 5 + margin_of_safety = 1.5 + attempts = 0 + + while len(prompt_token_ids) < prompt_tokens_count: + attempts += 1 + num_chars = int( + prompt_tokens_count * avg_chars_per_token * margin_of_safety * attempts + ) + text = unique + faker.text(max_nb_chars=num_chars) + prompt_token_ids = processor.encode(text) + + decoded = processor.decode( + prompt_token_ids[:prompt_tokens_count], skip_special_tokens=True + ) + if isinstance(decoded, list): + return decoded[0] if decoded else "" + return decoded + + +def _load_trace_rows( + path: Path, + timestamp_column: str, + prompt_tokens_column: str, + output_tokens_column: str, + max_rows: int | None = None, +) -> list[dict[str, Any]]: + """Load trace file into list of dicts with timestamp, prompt_tokens, + output_tokens.""" + try: + raw = load_trace_rows( + path, + required_columns=[ + timestamp_column, + prompt_tokens_column, + output_tokens_column, + ], + max_rows=max_rows, + ) + except (KeyError, ValueError) as e: + raise DataNotSupportedError(str(e)) from e + return [ + { + "timestamp": float(row[timestamp_column]), + "prompt_tokens": int(row[prompt_tokens_column]), + "output_tokens": int(row[output_tokens_column]), + } + for row in raw + ] + + +@DatasetDeserializerFactory.register("trace_synthetic") +class TraceSyntheticDatasetDeserializer(DatasetDeserializer): + """ + Load a trace file and generate a synthetic prompt per row. + + Trace file must have timestamp, and columns for prompt and output token counts + (default: input_length, output_length). Each row becomes one request with + a synthetic prompt of the requested input length. 
+ """ + + def __call__( + self, + data: Any, + processor_factory: Callable[[], PreTrainedTokenizerBase], + random_seed: int, + **data_kwargs: dict[str, Any], + ) -> Dataset: + if ( + not isinstance(data, str | Path) + or not (path := Path(data)).exists() + or not path.is_file() + ): + raise DataNotSupportedError( + "TraceSyntheticDatasetDeserializer expects a path to a trace file, " + f"got {data}" + ) + timestamp_column = str(data_kwargs.pop("timestamp_column", "timestamp")) + prompt_tokens_column = str( + data_kwargs.pop("prompt_tokens_column", "input_length") + ) + output_tokens_column = str( + data_kwargs.pop("output_tokens_column", "output_length") + ) + max_rows_val = data_kwargs.pop("max_rows", None) + max_rows: int | None = None + if max_rows_val is not None: + if isinstance(max_rows_val, int): + max_rows = max_rows_val + elif isinstance(max_rows_val, str): + max_rows = int(max_rows_val) + + rows = _load_trace_rows( + path, timestamp_column, prompt_tokens_column, output_tokens_column, max_rows + ) + if not rows: + raise DataNotSupportedError("Trace file is empty") + + processor = processor_factory() + faker = Faker() + faker.seed_instance(random_seed) + + prompts: list[str] = [] + prompt_tokens_counts: list[int] = [] + output_tokens_counts: list[int] = [] + for i, row in enumerate(rows): + n_in = row["prompt_tokens"] + n_out = row["output_tokens"] + prompt = _create_prompt(processor, n_in, faker, unique=f"{i} ") + prompts.append(prompt) + prompt_tokens_counts.append(n_in) + output_tokens_counts.append(n_out) + + # Avoid passing deserializer-only keys to Dataset.from_dict + data_kwargs.pop("type_", None) + + return Dataset.from_dict( + { + "prompt": prompts, + "prompt_tokens_count": prompt_tokens_counts, + "output_tokens_count": output_tokens_counts, + }, + **data_kwargs, + ) diff --git a/src/guidellm/data/trace_io.py b/src/guidellm/data/trace_io.py new file mode 100644 index 000000000..373d5f355 --- /dev/null +++ b/src/guidellm/data/trace_io.py @@ -0,0 +1,92 @@ +""" +Shared trace file I/O for replay benchmarks. + +Reads trace files (.jsonl only for now) and exposes raw rows or relative timestamps. +Used by the scheduler (load_relative_timestamps) and the trace_synthetic deserializer +(load_trace_rows with token columns). +""" + +from __future__ import annotations + +import json +from pathlib import Path +from typing import Any + +__all__ = ["load_relative_timestamps", "load_trace_rows"] + + +def load_trace_rows( + path: Path | str, + required_columns: list[str] | None = None, + max_rows: int | None = None, +) -> list[dict[str, Any]]: + """ + Load trace file rows as a list of dicts. + + Supports .jsonl only (one JSON object per line). + If required_columns is set, every row must contain these keys; otherwise + KeyError is raised with a descriptive message. + If max_rows is set, only the first max_rows rows are loaded (for replay + with a request limit). + + :param path: Path to the trace file. + :param required_columns: Optional list of column/field names that each row + must have. + :param max_rows: Optional maximum number of rows to load; None means load all. + If set to a value less than 1, returns an empty list. + :return: List of row dicts (keys and values as in the file). + :raises KeyError: If a required column is missing in the file or in a row. + :raises ValueError: If the file format is not .jsonl. 
+ """ + path = Path(path) + if max_rows is not None and max_rows < 1: + return [] + suffix = path.suffix.lower() + if suffix != ".jsonl": + raise ValueError(f"Unsupported trace file format: {suffix}") + + rows: list[dict[str, Any]] = [] + with path.open(encoding="utf-8") as f: + for raw_line in f: + if max_rows is not None and len(rows) >= max_rows: + break + line = raw_line.strip() + if not line: + continue + row = json.loads(line) + if not isinstance(row, dict): + continue + if required_columns: + missing = [c for c in required_columns if c not in row] + if missing: + raise KeyError(f"Trace row missing required columns: {missing}") + rows.append(row) + + return rows + + +def load_relative_timestamps( + path: Path | str, + timestamp_column: str = "timestamp", +) -> list[float]: + """ + Load timestamps from a trace file and return times relative to the first event. + + Trace file must be JSONL (one JSON object per line). Timestamps are sorted + chronologically before calculating relative times. The earliest timestamp + becomes 0.0, and all others are relative to it (always >= 0). + + :param path: Path to the trace file. + :param timestamp_column: Name of the column/field containing the timestamp. + :return: List of relative timestamps in seconds (first is 0.0, always sorted). + :raises ValueError: If the trace file is empty or has no valid rows. + """ + raw = load_trace_rows( + path, + required_columns=[timestamp_column], + ) + timestamps = sorted([float(row[timestamp_column]) for row in raw]) + if not timestamps: + raise ValueError(f"Trace file has no valid rows: {path}") + t0 = timestamps[0] + return [t - t0 for t in timestamps] diff --git a/src/guidellm/scheduler/__init__.py b/src/guidellm/scheduler/__init__.py index 3aa6b5a70..c772f0ff5 100644 --- a/src/guidellm/scheduler/__init__.py +++ b/src/guidellm/scheduler/__init__.py @@ -50,6 +50,8 @@ StrategyType, SynchronousStrategy, ThroughputStrategy, + TraceReplayStrategy, + load_relative_timestamps, ) from .worker import WorkerProcess from .worker_group import WorkerProcessGroup @@ -90,7 +92,9 @@ "StrategyType", "SynchronousStrategy", "ThroughputStrategy", + "TraceReplayStrategy", "UnserializableConstraintInitializer", "WorkerProcess", "WorkerProcessGroup", + "load_relative_timestamps", ] diff --git a/src/guidellm/scheduler/strategies.py b/src/guidellm/scheduler/strategies.py index ff8e76a4c..c3c5714c5 100644 --- a/src/guidellm/scheduler/strategies.py +++ b/src/guidellm/scheduler/strategies.py @@ -26,6 +26,7 @@ from pydantic import Field, NonNegativeFloat, NonNegativeInt, PositiveInt, PrivateAttr +from guidellm.data.trace_io import load_relative_timestamps from guidellm.schemas import PydanticClassRegistryMixin, RequestInfo from guidellm.utils.mixins import InfoMixin @@ -38,11 +39,13 @@ "StrategyType", "SynchronousStrategy", "ThroughputStrategy", + "TraceReplayStrategy", + "load_relative_timestamps", ] StrategyType = Annotated[ - Literal["synchronous", "concurrent", "throughput", "constant", "poisson"], + Literal["synchronous", "concurrent", "throughput", "constant", "poisson", "trace"], "Valid strategy type identifiers for scheduling request patterns", ] @@ -671,3 +674,53 @@ def request_completed(self, request_info: RequestInfo): :param request_info: Completed request metadata (unused) """ _ = request_info # request_info unused for async poisson strategy + + +@SchedulingStrategy.register("trace") +class TraceReplayStrategy(SchedulingStrategy): + """ + Replay scheduling from a trace of timestamps. 
+
+    Schedules each request at start_time + time_scale * relative_timestamp[i],
+    so the trace's inter-arrival pattern is reproduced with an optional time scale.
+    """
+
+    type_: Literal["trace"] = "trace"  # type: ignore[assignment]
+    relative_timestamps: list[float] = Field(
+        description="Request start times relative to first event (first = 0)",
+    )
+    time_scale: float = Field(
+        default=1.0,
+        gt=0,
+        description="Scale factor applied to relative timestamps",
+    )
+
+    def __str__(self) -> str:
+        return f"trace@{self.time_scale:.2f}"
+
+    @property
+    def processes_limit(self) -> PositiveInt | None:
+        return None
+
+    @property
+    def requests_limit(self) -> PositiveInt | None:
+        # Cap concurrency to the trace length so workers never hold more
+        # semaphore slots than there are items to process.
+        return len(self.relative_timestamps) if self.relative_timestamps else None
+
+    async def next_request_time(self, worker_index: NonNegativeInt) -> float:
+        _ = worker_index
+        start_time = await self.get_processes_start_time()
+        if not self.relative_timestamps:
+            return start_time
+
+        idx = self.next_request_index()
+        if idx > len(self.relative_timestamps):
+            # Trace exhausted: signal the worker to wait for constraint_reached_event.
+            # math.inf tells the worker the trace is done; it will wait for the
+            # constraint to be reached instead of scheduling more requests.
+            return math.inf
+        return start_time + self.time_scale * self.relative_timestamps[idx - 1]
+
+    def request_completed(self, request_info: RequestInfo):
+        _ = request_info
diff --git a/tests/unit/benchmark/test_replay_profile.py b/tests/unit/benchmark/test_replay_profile.py
new file mode 100644
index 000000000..996abb081
--- /dev/null
+++ b/tests/unit/benchmark/test_replay_profile.py
@@ -0,0 +1,409 @@
+## WRITTEN BY AI ##
+
+"""
+Unit tests for ReplayProfile.
+
+Ensures replay profile loads trace timestamps and creates TraceReplayStrategy with
+correct time_scale.
+""" + +from __future__ import annotations + +from pathlib import Path +from typing import Any + +import pytest + +from guidellm.benchmark.profiles import Profile, ReplayProfile +from guidellm.scheduler import TraceReplayStrategy + + +def _trace_path(tmp_path: Path, lines: list[str]) -> Path: + """Write JSONL lines to a trace file and return its path.""" + path = tmp_path / "trace.jsonl" + path.write_text("\n".join(lines)) + return path + + +class TestReplayProfile: + """Tests for ReplayProfile.""" + + @pytest.mark.smoke + def test_resolve_args_requires_data(self): + """resolve_args raises when data is missing.""" + with pytest.raises(ValueError, match="Replay profile requires data"): + ReplayProfile.resolve_args( + rate_type="replay", + rate=[1.0], + random_seed=42, + ) + + @pytest.mark.smoke + @pytest.mark.parametrize( + ("trace_lines", "rate", "expected_ts", "expected_scale"), + [ + # Basic trace + ( + [ + '{"timestamp": 0, "input_length": 10, "output_length": 5}', + '{"timestamp": 0.5, "input_length": 20, "output_length": 10}', + ], + [2.0], + [0.0, 0.5], + 2.0, + ), + # High token counts (8K-128K contexts) + ( + [ + '{"timestamp": 0, "input_length": 8192, "output_length": 1024}', + '{"timestamp": 0.5, "input_length": 32768, "output_length": 4096}', + '{"timestamp": 1.0, "input_length": 131072,' + '"output_length": 16384}', + ], + [1.0], + [0.0, 0.5, 1.0], + 1.0, + ), + # Unsorted timestamps (sorted chronologically, all >= 0) + ( + [ + '{"timestamp": 5.0, "input_length": 100, "output_length": 10}', + '{"timestamp": 2.0, "input_length": 200, "output_length": 20}', + '{"timestamp": 8.0, "input_length": 300, "output_length": 30}', + ], + [1.0], + [0.0, 3.0, 6.0], # Sorted: 2.0, 5.0, 8.0 -> 0.0, 3.0, 6.0 + 1.0, + ), + # Duplicate timestamps (concurrent burst) + ( + [ + '{"timestamp": 1.0, "input_length": 100, "output_length": 10}', + '{"timestamp": 1.0, "input_length": 200, "output_length": 20}', + '{"timestamp": 1.0, "input_length": 300, "output_length": 30}', + '{"timestamp": 2.5, "input_length": 400, "output_length": 40}', + ], + [2.0], + [0.0, 0.0, 0.0, 1.5], + 2.0, + ), + # High-frequency trace (millisecond-scale) + ( + [ + '{"timestamp": 0.000, "input_length": 100, "output_length": 10}', + '{"timestamp": 0.001, "input_length": 200, "output_length": 20}', + '{"timestamp": 0.002, "input_length": 300, "output_length": 30}', + '{"timestamp": 0.003, "input_length": 400, "output_length": 40}', + ], + [1.0], + [0.0, 0.001, 0.002, 0.003], + 1.0, + ), + ], + ) + def test_resolve_args_and_create_with_trace( + self, tmp_path: Path, trace_lines, rate, expected_ts, expected_scale + ): + """resolve_args loads trace; Profile.create returns ReplayProfile with + correct time_scale.""" + trace = _trace_path(tmp_path, trace_lines) + out = ReplayProfile.resolve_args( + rate_type="replay", + rate=rate, + random_seed=42, + data=[str(trace)], + ) + assert out["relative_timestamps"] == pytest.approx(expected_ts, abs=1e-9) + assert out["time_scale"] == expected_scale + profile = Profile.create( + rate_type="replay", + rate=rate, + random_seed=42, + data=[str(trace)], + ) + assert isinstance(profile, ReplayProfile) + assert profile.relative_timestamps == pytest.approx(expected_ts, abs=1e-9) + assert profile.time_scale == expected_scale + + @pytest.mark.smoke + def test_next_strategy_returns_trace_then_none(self, tmp_path: Path): + """next_strategy returns TraceReplayStrategy then None.""" + trace = _trace_path( + tmp_path, + ['{"timestamp": 0, "input_length": 1, "output_length": 1}'], + ) + kwargs = 
ReplayProfile.resolve_args( + rate_type="replay", + rate=[1.0], + random_seed=42, + data=[str(trace)], + ) + profile = ReplayProfile(**kwargs) + assert profile.strategy_types == ["trace"] + s1 = profile.next_strategy(None, None) + assert isinstance(s1, TraceReplayStrategy) + assert s1.relative_timestamps == [0.0] + assert s1.time_scale == 1.0 + assert profile.next_strategy(s1, None) is None + + @pytest.mark.smoke + def test_max_requests_less_than_one_raises(self, tmp_path: Path): + """max_requests < 1 in constraints raises ValueError.""" + trace = _trace_path( + tmp_path, + ['{"timestamp": 0, "input_length": 1, "output_length": 1}'], + ) + with pytest.raises(ValueError, match="max_requests must be >= 1"): + ReplayProfile.resolve_args( + rate_type="replay", + rate=[1.0], + random_seed=42, + data=[str(trace)], + constraints={"max_requests": 0}, + ) + + @pytest.mark.smoke + @pytest.mark.parametrize( + ("trace_lines", "max_req", "expected_ts"), + [ + # Basic truncation + ( + [ + '{"timestamp": 0, "input_length": 1, "output_length": 1}', + '{"timestamp": 1.0, "input_length": 2, "output_length": 2}', + '{"timestamp": 2.0, "input_length": 3, "output_length": 3}', + '{"timestamp": 3.0, "input_length": 4, "output_length": 4}', + ], + 2, + [0.0, 1.0], + ), + # Truncate concurrent burst (first 2 of 5 same-timestamp requests) + ( + [ + '{"timestamp": 1.0, "input_length": 100, "output_length": 10}', + '{"timestamp": 1.0, "input_length": 200, "output_length": 20}', + '{"timestamp": 1.0, "input_length": 300, "output_length": 30}', + '{"timestamp": 1.0, "input_length": 400, "output_length": 40}', + '{"timestamp": 1.0, "input_length": 500, "output_length": 50}', + ], + 2, + [0.0, 0.0], + ), + # Truncate after sorting (all rows loaded, sorted, then truncated) + # File order: 5.0, 2.0, 8.0, 1.0 -> sorted: 1.0, 2.0, 5.0, 8.0 + # Relative: 0.0, 1.0, 4.0, 7.0 -> truncated to 3: 0.0, 1.0, 4.0 + ( + [ + '{"timestamp": 5.0, "input_length": 100, "output_length": 10}', + '{"timestamp": 2.0, "input_length": 200, "output_length": 20}', + '{"timestamp": 8.0, "input_length": 300, "output_length": 30}', + '{"timestamp": 1.0, "input_length": 400, "output_length": 40}', + ], + 3, + [0.0, 1.0, 4.0], # 1.0->0.0, 2.0->1.0, 5.0->4.0 + ), + ], + ) + def test_max_requests_truncates_timestamps( + self, tmp_path: Path, trace_lines, max_req, expected_ts + ): + """max_requests truncates timestamps to first N rows (handles + duplicates/unsorted).""" + trace = _trace_path(tmp_path, trace_lines) + kwargs = ReplayProfile.resolve_args( + rate_type="replay", + rate=[1.0], + random_seed=42, + data=[str(trace)], + constraints={"max_requests": max_req}, + ) + assert kwargs["relative_timestamps"] == pytest.approx(expected_ts, abs=1e-9) + + @pytest.mark.smoke + @pytest.mark.parametrize( + ("trace_lines", "rate", "max_seconds", "expected_ts"), + [ + # Basic: time_scale=1.0, max_seconds=1.5 keeps timestamps <= 1.5 + ( + [ + '{"timestamp": 0, "input_length": 1, "output_length": 1}', + '{"timestamp": 0.5, "input_length": 2, "output_length": 2}', + '{"timestamp": 1.0, "input_length": 3, "output_length": 3}', + '{"timestamp": 2.0, "input_length": 4, "output_length": 4}', + ], + [1.0], # time_scale = 1.0 + 1.5, + [0.0, 0.5, 1.0], # 2.0 * 1.0 = 2.0 > 1.5, so excluded + ), + # With time_scale=2.0: effective times are 0, 1.0, 2.0, 4.0 + # max_seconds=1.5 keeps only timestamps where ts * 2.0 <= 1.5 + ( + [ + '{"timestamp": 0, "input_length": 1, "output_length": 1}', + '{"timestamp": 0.5, "input_length": 2, "output_length": 2}', + '{"timestamp": 1.0, 
"input_length": 3, "output_length": 3}', + '{"timestamp": 2.0, "input_length": 4, "output_length": 4}', + ], + [2.0], # time_scale = 2.0 + 1.5, + [0.0, 0.5], # 1.0 * 2.0 = 2.0 > 1.5, so excluded + ), + # With time_scale=0.5 (speedup): effective times are 0, 0.25, 0.5, 1.0 + # max_seconds=0.8 keeps only timestamps where ts * 0.5 <= 0.8 + ( + [ + '{"timestamp": 0, "input_length": 1, "output_length": 1}', + '{"timestamp": 0.5, "input_length": 2, "output_length": 2}', + '{"timestamp": 1.0, "input_length": 3, "output_length": 3}', + '{"timestamp": 2.0, "input_length": 4, "output_length": 4}', + ], + [0.5], # time_scale = 0.5 + 0.8, + [0.0, 0.5, 1.0], # 2.0 * 0.5 = 1.0 > 0.8, so excluded + ), + # max_seconds larger than all timestamps: all kept + ( + [ + '{"timestamp": 0, "input_length": 1, "output_length": 1}', + '{"timestamp": 1.0, "input_length": 2, "output_length": 2}', + ], + [1.0], + 10.0, + [0.0, 1.0], + ), + # max_seconds very small: only first timestamp kept + ( + [ + '{"timestamp": 0, "input_length": 1, "output_length": 1}', + '{"timestamp": 0.1, "input_length": 2, "output_length": 2}', + '{"timestamp": 0.2, "input_length": 3, "output_length": 3}', + ], + [1.0], + 0.05, + [0.0], + ), + ], + ) + def test_max_seconds_filters_timestamps_with_time_scale( + self, tmp_path: Path, trace_lines, rate, max_seconds, expected_ts + ): + """max_seconds filters timestamps based on effective time (ts * time_scale).""" + trace = _trace_path(tmp_path, trace_lines) + kwargs = ReplayProfile.resolve_args( + rate_type="replay", + rate=rate, + random_seed=42, + data=[str(trace)], + constraints={"max_seconds": max_seconds}, + ) + assert kwargs["relative_timestamps"] == pytest.approx(expected_ts, abs=1e-9) + assert kwargs["time_scale"] == rate[0] + + @pytest.mark.smoke + def test_max_seconds_with_max_requests_both_apply(self, tmp_path: Path): + """Both max_seconds and max_requests constraints apply (intersection).""" + trace = _trace_path( + tmp_path, + [ + '{"timestamp": 0, "input_length": 1, "output_length": 1}', + '{"timestamp": 1.0, "input_length": 2, "output_length": 2}', + '{"timestamp": 2.0, "input_length": 3, "output_length": 3}', + '{"timestamp": 3.0, "input_length": 4, "output_length": 4}', + '{"timestamp": 4.0, "input_length": 5, "output_length": 5}', + ], + ) + kwargs = ReplayProfile.resolve_args( + rate_type="replay", + rate=[1.0], + random_seed=42, + data=[str(trace)], + constraints={"max_requests": 4, "max_seconds": 2.5}, + ) + # max_requests limits to first 4: [0, 1.0, 2.0, 3.0] + # Then max_seconds filters to <= 2.5: [0, 1.0, 2.0] + assert kwargs["relative_timestamps"] == pytest.approx([0.0, 1.0, 2.0], abs=1e-9) + + @pytest.mark.smoke + def test_max_seconds_filters_and_sets_max_requests(self, tmp_path: Path): + """max_seconds filters timestamps at load time and max_requests is set to + the actual count to synchronize the data loader and prevent benchmark hang.""" + trace = _trace_path( + tmp_path, + [ + '{"timestamp": 0, "input_length": 1, "output_length": 1}', + '{"timestamp": 1.0, "input_length": 2, "output_length": 2}', + '{"timestamp": 2.0, "input_length": 3, "output_length": 3}', + '{"timestamp": 3.0, "input_length": 4, "output_length": 4}', + '{"timestamp": 4.0, "input_length": 5, "output_length": 5}', + ], + ) + constraints: dict[str, Any] = {"max_seconds": 2.5} + kwargs = ReplayProfile.resolve_args( + rate_type="replay", + rate=[1.0], + random_seed=42, + data=[str(trace)], + constraints=constraints, + ) + # max_seconds=2.5 with time_scale=1.0 keeps ts <= 2.5: [0, 1.0, 2.0] + # = 3 
requests + assert kwargs["relative_timestamps"] == pytest.approx([0.0, 1.0, 2.0], abs=1e-9) + # max_requests is always set to actual count after filtering + assert constraints.get("max_requests") == 3 + # max_seconds is removed to avoid runtime constraint conflicts + assert "max_seconds" not in constraints + + @pytest.mark.smoke + def test_max_requests_always_updated_to_actual_count(self, tmp_path: Path): + """max_requests is always set to the actual count of timestamps after + filtering.""" + trace = _trace_path( + tmp_path, + [ + '{"timestamp": 0, "input_length": 1, "output_length": 1}', + '{"timestamp": 1.0, "input_length": 2, "output_length": 2}', + '{"timestamp": 2.0, "input_length": 3, "output_length": 3}', + '{"timestamp": 3.0, "input_length": 4, "output_length": 4}', + ], + ) + constraints: dict[str, Any] = {"max_requests": 2, "max_seconds": 10.0} + kwargs = ReplayProfile.resolve_args( + rate_type="replay", + rate=[1.0], + random_seed=42, + data=[str(trace)], + constraints=constraints, + ) + # max_requests=2 takes first 2 timestamps: [0, 1.0] + # max_seconds=10.0 keeps all (ts * 1.0 <= 10.0) + # Result: [0, 1.0] - but max_requests is always updated to actual count + assert kwargs["relative_timestamps"] == pytest.approx([0.0, 1.0], abs=1e-9) + # constraints['max_requests'] is always set to actual count after filtering + assert constraints.get("max_requests") == 2 # matches len(relative_timestamps) + + @pytest.mark.smoke + def test_max_seconds_removed_from_constraints(self, tmp_path: Path): + """max_seconds is removed from constraints after load-time filtering.""" + trace = _trace_path( + tmp_path, + [ + '{"timestamp": 0, "input_length": 1, "output_length": 1}', + '{"timestamp": 1.0, "input_length": 2, "output_length": 2}', + '{"timestamp": 2.0, "input_length": 3, "output_length": 3}', + ], + ) + constraints: dict[str, Any] = {"max_seconds": 1.5} + kwargs = ReplayProfile.resolve_args( + rate_type="replay", + rate=[1.0], + random_seed=42, + data=[str(trace)], + constraints=constraints, + ) + # max_seconds should be removed to avoid runtime MaxDurationConstraint + assert "max_seconds" not in constraints + assert kwargs["constraints"] is constraints + # Verify timestamps were filtered: ts <= 1.5 -> [0, 1.0] + assert kwargs["relative_timestamps"] == pytest.approx([0.0, 1.0], abs=1e-9) + # max_requests set to actual count + assert constraints.get("max_requests") == 2 diff --git a/tests/unit/data/deserializers/test_trace_synthetic.py b/tests/unit/data/deserializers/test_trace_synthetic.py new file mode 100644 index 000000000..4c5462d24 --- /dev/null +++ b/tests/unit/data/deserializers/test_trace_synthetic.py @@ -0,0 +1,134 @@ +## WRITTEN BY AI ## + +""" +Unit tests for TraceSyntheticDatasetDeserializer. + +Ensures trace file is loaded and synthetic prompts are generated with exact +input_length. 
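+
+The mock tokenizer used here treats each whitespace-separated word as one
+token, so exact token counts can be asserted without a real tokenizer.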
+""" + +from __future__ import annotations + +from pathlib import Path +from unittest.mock import Mock + +import pytest +from datasets import Dataset + +from guidellm.data.deserializers.trace_synthetic import ( + TraceSyntheticDatasetDeserializer, +) +from guidellm.data.schemas import DataNotSupportedError + + +def _mock_processor(): + """Tokenizer that returns token count = number of words in text.""" + proc = Mock() + proc.encode.side_effect = lambda text: list(range(max(1, len(text.split())))) + proc.decode.side_effect = lambda tokens, skip_special_tokens=False: " ".join( + "x" for _ in range(len(tokens)) + ) + return proc + + +def _deserialize(deserializer, data, **kwargs): + defaults = { + "processor_factory": _mock_processor, + "random_seed": 42, + } + return deserializer(**{**defaults, "data": data, **kwargs}) + + +class TestTraceSyntheticDatasetDeserializer: + """Tests for TraceSyntheticDatasetDeserializer.""" + + @pytest.fixture + def deserializer(self): + return TraceSyntheticDatasetDeserializer() + + @pytest.mark.smoke + @pytest.mark.parametrize( + ("content", "expected"), + [ + # Basic small counts + ( + '{"timestamp": 0, "input_length": 50, "output_length": 20}\n' + '{"timestamp": 0.5, "input_length": 100, "output_length": 30}\n', + [(50, 20), (100, 30)], + ), + # Production-scale token counts (4K-128K contexts) + ( + '{"timestamp": 0, "input_length": 4096, "output_length": 512}\n' + '{"timestamp": 1.0, "input_length": 8192, "output_length": 1024}\n' + '{"timestamp": 2.0, "input_length": 32768, "output_length": 4096}\n' + '{"timestamp": 3.0, "input_length": 131072, "output_length": 8192}\n', + [(4096, 512), (8192, 1024), (32768, 4096), (131072, 8192)], + ), + # Mixed high/low alternating (edge cases) + ( + '{"timestamp": 0, "input_length": 10, "output_length": 5}\n' + '{"timestamp": 0.1, "input_length": 65536, "output_length": 16384}\n' + '{"timestamp": 0.2, "input_length": 20, "output_length": 10}\n' + '{"timestamp": 0.3, "input_length": 131072, "output_length": 32768}\n', + [(10, 5), (65536, 16384), (20, 10), (131072, 32768)], + ), + # Unsorted timestamps with duplicates (preserves file order) + ( + '{"timestamp": 5.0, "input_length": 100, "output_length": 10}\n' + '{"timestamp": 2.0, "input_length": 200, "output_length": 20}\n' + '{"timestamp": 8.0, "input_length": 300, "output_length": 30}\n' + '{"timestamp": 2.0, "input_length": 400, "output_length": 40}\n', + [(100, 10), (200, 20), (300, 30), (400, 40)], + ), + # Concurrent burst (5 requests at same timestamp) + ( + '{"timestamp": 1.0, "input_length": 100, "output_length": 10}\n' + '{"timestamp": 1.0, "input_length": 200, "output_length": 20}\n' + '{"timestamp": 1.0, "input_length": 300, "output_length": 30}\n' + '{"timestamp": 1.0, "input_length": 400, "output_length": 40}\n' + '{"timestamp": 1.0, "input_length": 500, "output_length": 50}\n', + [(100, 10), (200, 20), (300, 30), (400, 40), (500, 50)], + ), + ], + ) + def test_load_jsonl_various_scenarios( + self, tmp_path: Path, deserializer, content, expected + ): + """Trace JSONL yields exact token counts (small, large, mixed, unsorted, + duplicates).""" + trace = tmp_path / "trace.jsonl" + trace.write_text(content) + ds = _deserialize(deserializer, str(trace), type_="trace_synthetic") + assert isinstance(ds, Dataset) + assert len(ds) == len(expected) + assert set(ds.column_names) >= { + "prompt", + "prompt_tokens_count", + "output_tokens_count", + } + for row, (in_len, out_len) in zip(ds, expected, strict=True): + assert row["prompt_tokens_count"] == in_len + assert 
row["output_tokens_count"] == out_len + + @pytest.mark.smoke + def test_rejects_invalid_data(self, deserializer): + """Non-path data raises DataNotSupportedError.""" + with pytest.raises(DataNotSupportedError, match="path to a trace file"): + _deserialize(deserializer, 123) + + @pytest.mark.sanity + @pytest.mark.parametrize( + ("content", "match"), + [ + ("", "empty"), + ('{"ts": 0, "input_length": 10, "output_length": 5}\n', "timestamp"), + ], + ) + def test_trace_validation_raises( + self, tmp_path: Path, deserializer, content, match + ): + """Empty trace or missing required column raises DataNotSupportedError.""" + trace = tmp_path / "trace.jsonl" + trace.write_text(content) + with pytest.raises(DataNotSupportedError, match=match): + _deserialize(deserializer, str(trace)) diff --git a/tests/unit/scheduler/test_trace_replay.py b/tests/unit/scheduler/test_trace_replay.py new file mode 100644 index 000000000..f1eb34ac7 --- /dev/null +++ b/tests/unit/scheduler/test_trace_replay.py @@ -0,0 +1,253 @@ +## WRITTEN BY AI ## + +""" +Unit tests for trace replay strategy and load_relative_timestamps. + +Verifies that TraceReplayStrategy schedules requests at start_time + time_scale +* relative_timestamp[i] and that load_relative_timestamps correctly parses trace +files. +""" + +from __future__ import annotations + +import asyncio +import json +import math +from pathlib import Path + +import pytest + +from guidellm.scheduler import ( + SchedulingStrategy, + TraceReplayStrategy, + load_relative_timestamps, +) +from guidellm.schemas import RequestInfo + + +def _write_trace(path: Path, content: str) -> Path: + path.write_text(content) + return path + + +class TestLoadRelativeTimestamps: + """Tests for load_relative_timestamps helper.""" + + @pytest.mark.smoke + @pytest.mark.parametrize( + ("content", "kwargs", "expected"), + [ + # Basic cases + ( + '{"timestamp": 100, "input_length": 10}\n' + '{"timestamp": 100.5, "input_length": 20}\n' + '{"timestamp": 101.2, "input_length": 15}\n', + {"timestamp_column": "timestamp"}, + [0.0, 0.5, 1.2], + ), + ( + '{"ts": 0, "input_length": 1}\n{"ts": 2.5, "input_length": 2}\n', + {"timestamp_column": "ts"}, + [0.0, 2.5], + ), + # High token counts (production-like: 2K-128K contexts) + ( + '{"timestamp": 0, "input_length": 2048, "output_length": 512}\n' + '{"timestamp": 1.5, "input_length": 4096, "output_length": 1024}\n' + '{"timestamp": 3.0, "input_length": 8192, "output_length": 2048}\n' + '{"timestamp": 4.5, "input_length": 32768, "output_length": 8192}\n' + '{"timestamp": 6.0, "input_length": 131072, "output_length": 32768}\n', + {"timestamp_column": "timestamp"}, + [0.0, 1.5, 3.0, 4.5, 6.0], + ), + # Unsorted timestamps (sorted chronologically, all >= 0) + ( + '{"timestamp": 5.0, "input_length": 10}\n' + '{"timestamp": 2.0, "input_length": 20}\n' + '{"timestamp": 8.0, "input_length": 30}\n', + {"timestamp_column": "timestamp"}, + [0.0, 3.0, 6.0], # Sorted: 2.0, 5.0, 8.0 -> 0.0, 3.0, 6.0 + ), + # Duplicate timestamps (concurrent burst) + ( + '{"timestamp": 1.0, "input_length": 10}\n' + '{"timestamp": 1.0, "input_length": 20}\n' + '{"timestamp": 1.0, "input_length": 30}\n' + '{"timestamp": 2.5, "input_length": 40}\n', + {"timestamp_column": "timestamp"}, + [0.0, 0.0, 0.0, 1.5], + ), + ], + ) + def test_load_valid_jsonl(self, tmp_path: Path, content, kwargs, expected): + """Load JSONL trace and get sorted relative timestamps (basic, high counts, + unsorted, duplicates).""" + trace = tmp_path / "trace.jsonl" + _write_trace(trace, content) + out = 
load_relative_timestamps(trace, **kwargs) + assert out == pytest.approx(expected, abs=1e-9) + + @pytest.mark.smoke + def test_empty_trace_raises(self, tmp_path: Path): + """Empty trace file raises ValueError.""" + trace = tmp_path / "trace.jsonl" + _write_trace(trace, "") + with pytest.raises(ValueError, match="no valid rows"): + load_relative_timestamps(trace) + + @pytest.mark.smoke + @pytest.mark.parametrize( + ("suffix", "content", "match"), + [ + ( + "json", + json.dumps( + [ + {"timestamp": 0, "input_length": 1}, + {"timestamp": 1.0, "input_length": 2}, + ] + ), + r"Unsupported trace file format.*\.json", + ), + ( + "csv", + "timestamp,input_length,output_length\n0,10,5\n0.3,20,10\n", + r"Unsupported trace file format.*\.csv", + ), + ("txt", "0\n1\n", "Unsupported trace file format"), + ], + ) + def test_unsupported_format_raises(self, tmp_path: Path, suffix, content, match): + """JSON array, CSV, or unknown suffix raises ValueError.""" + trace = tmp_path / f"trace.{suffix}" + _write_trace(trace, content) + with pytest.raises(ValueError, match=match): + load_relative_timestamps(trace) + + +class TestTraceReplayStrategy: + """Tests for TraceReplayStrategy.""" + + @pytest.mark.smoke + @pytest.mark.parametrize( + ("timestamps", "time_scale"), + [ + ([0.0, 0.5, 1.0], 2.0), + ([0.0, 1.0], 0.5), + ], + ) + def test_initialization_and_str(self, timestamps, time_scale): + """Init, type_, optional str, and limits.""" + strategy = TraceReplayStrategy( + relative_timestamps=timestamps, + time_scale=time_scale, + ) + assert strategy.type_ == "trace" + assert strategy.relative_timestamps == timestamps + assert strategy.time_scale == time_scale + assert strategy.processes_limit is None + # requests_limit equals trace length to cap concurrency to available requests + assert strategy.requests_limit == len(timestamps) + if time_scale == 0.5: + assert str(strategy) == "trace@0.50" + + @pytest.mark.smoke + def test_marshalling(self): + """Pydantic dump/load and polymorphic restore.""" + strategy = TraceReplayStrategy( + relative_timestamps=[0.0, 1.0, 2.0], + time_scale=1.5, + ) + data = strategy.model_dump() + assert data["type_"] == "trace" + assert data["relative_timestamps"] == [0.0, 1.0, 2.0] + assert data["time_scale"] == 1.5 + reconstructed = TraceReplayStrategy.model_validate(data) + assert reconstructed.relative_timestamps == strategy.relative_timestamps + base = SchedulingStrategy.model_validate(data) + assert isinstance(base, TraceReplayStrategy) + + @pytest.mark.smoke + def test_next_request_time_scaled_timestamps(self): + """next_request_time returns start_time + time_scale * relative_ts[i].""" + strategy = TraceReplayStrategy( + relative_timestamps=[0.0, 0.5, 1.0], + time_scale=2.0, + ) + strategy.init_processes_timings(worker_count=1, max_concurrency=10) + strategy.init_processes_start(1000.0) + expected = [1000.0, 1001.0, 1002.0] + + async def run(): + for exp in expected: + t = await strategy.next_request_time(0) + assert t == pytest.approx(exp, abs=1e-6) + + asyncio.run(run()) + + @pytest.mark.smoke + def test_next_request_time_beyond_trace_parks_worker(self): + """When index > len(relative_timestamps), return math.inf to park the slot. + + Returning math.inf causes the worker to sleep indefinitely until + constraint_reached_event cancels it, preventing it from racing the + messaging queue with a stale target timestamp. 
+ """ + strategy = TraceReplayStrategy( + relative_timestamps=[0.0, 1.0], + time_scale=1.0, + ) + strategy.init_processes_timings(worker_count=1, max_concurrency=10) + strategy.init_processes_start(500.0) + + async def run(): + await strategy.next_request_time(0) + await strategy.next_request_time(0) + t3 = await strategy.next_request_time(0) + assert t3 == math.inf + + asyncio.run(run()) + + @pytest.mark.smoke + def test_request_completed_no_op(self): + """request_completed is a no-op.""" + strategy = TraceReplayStrategy(relative_timestamps=[0.0], time_scale=1.0) + info = RequestInfo( + request_id="x", + status="completed", + scheduler_process_id=0, + scheduler_start_time=0, + ) + strategy.request_completed(info) + + @pytest.mark.sanity + @pytest.mark.parametrize( + ("timestamps", "expected"), + [ + # Concurrent burst: 3 requests at same time + ([0.0, 0.0, 0.0, 1.0, 2.0], [1000.0, 1000.0, 1000.0, 1001.0, 1002.0]), + # Unsorted timestamps (sorted by load_relative_timestamps, all >= 0) + ([0.0, 3.0, 5.0, 6.0], [1000.0, 1003.0, 1005.0, 1006.0]), + # High frequency burst (millisecond scale) + ( + [0.0, 0.001, 0.002, 0.003, 0.004], + [1000.0, 1000.001, 1000.002, 1000.003, 1000.004], + ), + ], + ) + def test_scheduling_patterns(self, timestamps, expected): + """Test concurrent bursts, unsorted timestamps (now sorted), and high-frequency + patterns.""" + strategy = TraceReplayStrategy( + relative_timestamps=timestamps, + time_scale=1.0, + ) + strategy.init_processes_timings(worker_count=3, max_concurrency=10) + strategy.init_processes_start(1000.0) + + async def run(): + for exp in expected: + t = await strategy.next_request_time(0) + assert t == pytest.approx(exp, abs=1e-6) + + asyncio.run(run()) From a9572997d71e87515d8739e3437f963ad1616e62 Mon Sep 17 00:00:00 2001 From: Vincent Gimenes Date: Wed, 11 Mar 2026 23:45:18 +0100 Subject: [PATCH 02/27] fix the CI mypy Signed-off-by: Vincent Gimenes --- src/guidellm/benchmark/entrypoints.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/guidellm/benchmark/entrypoints.py b/src/guidellm/benchmark/entrypoints.py index 1cce293e8..f623a7ab3 100644 --- a/src/guidellm/benchmark/entrypoints.py +++ b/src/guidellm/benchmark/entrypoints.py @@ -550,12 +550,14 @@ async def benchmark_generative_text( else args.max_requests ) request_loader = await resolve_request_loader( - **loader_kwargs, max_requests=effective_max_requests - ) # type: ignore[arg-type] + **loader_kwargs, # type: ignore[arg-type,misc] + max_requests=effective_max_requests, # type: ignore[arg-type] + ) else: request_loader = await resolve_request_loader( - **loader_kwargs, max_requests=args.max_requests - ) # type: ignore[arg-type] + **loader_kwargs, # type: ignore[arg-type,misc] + max_requests=args.max_requests, # type: ignore[arg-type] + ) profile = await resolve_profile(**profile_kwargs, data=None) # type: ignore[arg-type] warmup = TransientPhaseConfig.create_from_value(args.warmup) From b557fe1b67b8d06901ed05b604cfb89c40d85646 Mon Sep 17 00:00:00 2001 From: Vincent Gimenes Date: Sun, 15 Mar 2026 23:54:08 +0100 Subject: [PATCH 03/27] add e2e tests Signed-off-by: Vincent Gimenes --- tests/e2e/test_successful_benchmark.py | 188 +++++++++++++++++++++++++ tests/e2e/utils.py | 12 +- 2 files changed, 194 insertions(+), 6 deletions(-) diff --git a/tests/e2e/test_successful_benchmark.py b/tests/e2e/test_successful_benchmark.py index 8703882b2..85ada36e6 100644 --- a/tests/e2e/test_successful_benchmark.py +++ b/tests/e2e/test_successful_benchmark.py @@ -120,3 +120,191 @@ 
def test_max_requests_benchmark(server: VllmSimServer, tmp_path: Path): f"Expected {max_requests} successful requests, got {len(successful_requests)}" ) assert_successful_requests_fields(successful_requests) + + +@pytest.mark.timeout(30) +@pytest.mark.sanity +def test_replay_profile_benchmark(server: VllmSimServer, tmp_path: Path): + """ + Test trace replay profile with a simple trace file. + Validates that requests are replayed with correct timing from trace. + Also tests time_scale (rate) functionality. + """ + report_name = "replay_benchmarks.json" + report_path = tmp_path / report_name + + # Create trace file with 5 requests at 0.05s intervals + trace_file = _create_trace_file(tmp_path, num_requests=5, interval=0.05) + + # Create and configure the guidellm client with replay profile + client = GuidellmClient( + target=server.get_url(), + output_dir=tmp_path, + outputs=report_name, + ) + + # Start the benchmark with replay profile + # rate=2.0 means time_scale=2.0 (timestamps multiplied by 2) + client.start_benchmark( + profile="replay", + rate=2.0, + max_requests=5, + data=str(trace_file), + processor="gpt2", + ) + + # Wait for the benchmark to complete + client.wait_for_completion(timeout=30) + + # Assert no Python exceptions occurred + assert_no_python_exceptions(client.stderr) + + # Load and validate the report + report = load_benchmark_report(report_path) + assert len(report["benchmarks"]) == 1 + + benchmark = report["benchmarks"][0] + + # Validate successful requests have all expected fields + successful_requests = benchmark["requests"]["successful"] + assert len(successful_requests) == 5, ( + f"Expected 5 successful requests, got {len(successful_requests)}" + ) + assert_successful_requests_fields(successful_requests) + + # Verify scheduler state shows correct request count + assert "scheduler_state" in benchmark + scheduler_state = benchmark["scheduler_state"] + assert scheduler_state["processed_requests"] == 5 + + +@pytest.mark.timeout(30) +@pytest.mark.sanity +def test_replay_profile_max_requests_stronger_than_max_seconds( + server: VllmSimServer, tmp_path: Path +): + """ + Test replay profile where max_requests is the limiting constraint. + Trace has 20 requests over 2 seconds, but max_requests=5 limits to 5. + max_seconds=10 is not reached because max_requests triggers first. 
+ """ + report_name = "replay_max_requests_stronger.json" + report_path = tmp_path / report_name + + # Create trace with 20 requests at 0.1s intervals (total 1.9s) + trace_file = _create_trace_file(tmp_path, num_requests=20, interval=0.1) + + client = GuidellmClient( + target=server.get_url(), + output_dir=tmp_path, + outputs=report_name, + ) + + # max_requests=5 should be the limiting constraint + # max_seconds=10 should NOT be reached + client.start_benchmark( + profile="replay", + rate=1.0, + max_requests=5, + max_seconds=10, # Very high, won't be reached + data=str(trace_file), + processor="gpt2", + ) + + client.wait_for_completion(timeout=30) + assert_no_python_exceptions(client.stderr) + + report = load_benchmark_report(report_path) + benchmark = report["benchmarks"][0] + + # Should only have 5 requests (max_requests won) + successful_requests = benchmark["requests"]["successful"] + assert len(successful_requests) == 5, ( + f"Expected 5 requests (max_requests limit), got {len(successful_requests)}" + ) + + # Verify max_requests constraint was triggered + assert_constraint_triggered(benchmark, "max_requests", {"processed_exceeded": True}) + + +@pytest.mark.timeout(30) +@pytest.mark.sanity +def test_replay_profile_max_seconds_stronger_than_max_requests( + server: VllmSimServer, tmp_path: Path +): + """ + Test replay profile where max_seconds is the limiting constraint. + Trace has 20 requests over 2 seconds, but max_seconds=0.3 limits to ~3 requests. + max_requests=10 is not reached because max_seconds triggers first. + """ + report_name = "replay_max_seconds_stronger.json" + report_path = tmp_path / report_name + + # Create trace with 20 requests at 0.1s intervals + # With time_scale=1.0, timestamps are: 0.0, 0.1, 0.2, 0.3, 0.4, ... + # max_seconds=0.25 should include: 0.0, 0.1, 0.2 (3 requests, 0.3 > 0.25) + trace_file = _create_trace_file(tmp_path, num_requests=20, interval=0.1) + + client = GuidellmClient( + target=server.get_url(), + output_dir=tmp_path, + outputs=report_name, + ) + + # max_seconds=0.25 should be the limiting constraint + # Only timestamps <= 0.25 should be kept: 0.0, 0.1, 0.2 + client.start_benchmark( + profile="replay", + rate=1.0, + max_requests=10, # High, won't be reached + max_seconds=0.25, + data=str(trace_file), + processor="gpt2", + ) + + client.wait_for_completion(timeout=30) + assert_no_python_exceptions(client.stderr) + + report = load_benchmark_report(report_path) + benchmark = report["benchmarks"][0] + + # Should have 3 requests (0.0, 0.1, 0.2 where 0.2 <= 0.25) + successful_requests = benchmark["requests"]["successful"] + assert len(successful_requests) == 3, ( + f"Expected 3 requests (max_seconds=0.25 filter), got {len(successful_requests)}" + ) + + # Verify max_requests constraint was triggered + # (max_seconds is converted to max_requests internally) + assert_constraint_triggered(benchmark, "max_requests", {"processed_exceeded": True}) + + +# Helper functions for trace file creation + + +def _create_trace_file( + tmp_path: Path, num_requests: int = 5, interval: float = 0.1 +) -> Path: + """Create a trace file with evenly spaced timestamps for testing.""" + trace_file = tmp_path / "trace.jsonl" + lines = [ + f'{{"timestamp": {i * interval}, ' + f'"input_length": {10 * (i + 1)}, ' + f'"output_length": {5 * (i + 1)}}}' + for i in range(num_requests) + ] + trace_file.write_text("\n".join(lines)) + return trace_file + + +def _create_burst_trace_file(tmp_path: Path, num_requests: int = 10) -> Path: + """Create a trace file with all requests at the same 
timestamp.""" + trace_file = tmp_path / "trace_burst.jsonl" + lines = [ + f'{{"timestamp": 0.0, ' + f'"input_length": {20 * (i + 1)}, ' + f'"output_length": {10 * (i + 1)}}}' + for i in range(num_requests) + ] + trace_file.write_text("\n".join(lines)) + return trace_file diff --git a/tests/e2e/utils.py b/tests/e2e/utils.py index 55baa89d2..03f039ef2 100644 --- a/tests/e2e/utils.py +++ b/tests/e2e/utils.py @@ -45,12 +45,12 @@ def __init__( def start_benchmark( self, profile: str = "constant", - rate: int = 10, - max_seconds: int | None = None, + rate: int | float = 10, + max_seconds: int | float | None = None, max_requests: int | None = None, max_error_rate: float | None = None, over_saturation: dict[str, Any] | None = None, - data: str = "prompt_tokens=256,output_tokens=128", + data: str | Path = "prompt_tokens=256,output_tokens=128", processor: str = "gpt2", additional_args: str = "", extra_env: dict[str, str] | None = None, @@ -59,13 +59,13 @@ def start_benchmark( Start a guidellm benchmark command. :param profile: Type of rate control (constant, etc.) - :param rate: Request rate + :param rate: Request rate (or time_scale for replay profile) :param max_seconds: Maximum duration in seconds :param max_requests: Maximum number of requests :param max_error_rate: Maximum error rate before stopping :param over_saturation: Over-saturation detection configuration (dict). Passed as JSON string to --over-saturation CLI argument. - :param data: Data configuration string + :param data: Data configuration string or Path to trace file for replay profile :param processor: Processor/tokenizer to use :param additional_args: Additional command line arguments :param extra_env: Additional environment variables to set @@ -109,7 +109,7 @@ def start_benchmark( cmd_parts.extend( [ - f'--data "{data}"', + f'--data "{str(data)}"', f'--processor "{processor}"', f"--output-dir {self.output_dir}", f"--outputs {self.outputs}", From edc18bad7331b9d1b988ffb98dec211c5ff0b86f Mon Sep 17 00:00:00 2001 From: Vincent Gimenes Date: Tue, 17 Mar 2026 17:39:32 +0100 Subject: [PATCH 04/27] fix ruff error CI Signed-off-by: Vincent Gimenes --- src/guidellm/benchmark/profiles.py | 3 ++- tests/unit/benchmark/test_replay_profile.py | 3 +-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/guidellm/benchmark/profiles.py b/src/guidellm/benchmark/profiles.py index 11c1de0fe..5501d08e5 100644 --- a/src/guidellm/benchmark/profiles.py +++ b/src/guidellm/benchmark/profiles.py @@ -424,7 +424,8 @@ def resolve_args( "relative_timestamps": relative_timestamps, "time_scale": time_scale, "constraints": constraints, - "max_seconds_filter": max_seconds if max_seconds and max_seconds > 0 + "max_seconds_filter": max_seconds + if max_seconds and max_seconds > 0 else None, } diff --git a/tests/unit/benchmark/test_replay_profile.py b/tests/unit/benchmark/test_replay_profile.py index 996abb081..cb32273aa 100644 --- a/tests/unit/benchmark/test_replay_profile.py +++ b/tests/unit/benchmark/test_replay_profile.py @@ -57,8 +57,7 @@ def test_resolve_args_requires_data(self): [ '{"timestamp": 0, "input_length": 8192, "output_length": 1024}', '{"timestamp": 0.5, "input_length": 32768, "output_length": 4096}', - '{"timestamp": 1.0, "input_length": 131072,' - '"output_length": 16384}', + '{"timestamp": 1.0, "input_length": 131072,"output_length": 16384}', ], [1.0], [0.0, 0.5, 1.0], From 18433f52de5ef8739c8cfd5383b04ef631c20856 Mon Sep 17 00:00:00 2001 From: Vincent Gimenes Date: Wed, 18 Mar 2026 12:30:06 +0100 Subject: [PATCH 05/27] Add trace 
replay documentation Signed-off-by: Vincent Gimenes --- docs/getting-started/benchmark.md | 30 ++++++++++++++++++++++++++++++ docs/guides/datasets.md | 5 +++++ 2 files changed, 35 insertions(+) diff --git a/docs/getting-started/benchmark.md b/docs/getting-started/benchmark.md index 6173907f6..d5a8ed11d 100644 --- a/docs/getting-started/benchmark.md +++ b/docs/getting-started/benchmark.md @@ -82,6 +82,14 @@ For example, setting `--max-requests 1000` with `--profile sweep` will run 1000 GuideLLM supports several benchmark profiles and strategies, which are described in detail below. +- `synchronous`: Runs requests one at a time (sequential) +- `throughput`: Tests maximum throughput by running requests in parallel +- `concurrent`: Runs a fixed number of parallel request streams +- `constant`: Sends requests at a fixed rate per second +- `poisson`: Sends requests following a Poisson distribution +- `sweep`: Automatically determines optimal performance points (default) +- `replay`: Replays requests from a trace file to reproduce real-world traffic patterns (beta) + #### Synchronous Profile Runs requests one at a time (sequential). @@ -187,6 +195,28 @@ guidellm benchmark \ You can customize synthetic data generation with additional parameters such as standard deviation, minimum, and maximum values. See the [Datasets Synthetic data documentation](../guides/datasets.md#synthetic-data) for more details. +### Trace Replay Benchmarking (beta) + +For realistic load testing, replay traffic patterns from trace files. Trace files must be JSONL with `timestamp`, `input_length`, and `output_length` fields: + +```json +{"timestamp": 0, "input_length": 256, "output_length": 128} +{"timestamp": 0.5, "input_length": 512, "output_length": 64} +``` + +Run with the `replay` profile: + +```bash +guidellm benchmark \ + --target "http://localhost:8000" \ + --data "path/to/trace.jsonl" \ + --data-args '{"type_": "trace_synthetic"}' \ + --profile replay \ + --rate 1.0 +``` + +The `rate` parameter acts as a time scale: `1.0` for original speed, `2.0` for 2x faster, `0.5` for half speed. + ### Working with Real Data While synthetic data is convenient for quick tests, you can benchmark with real-world data: diff --git a/docs/guides/datasets.md b/docs/guides/datasets.md index e5104d1ec..9c80d8edc 100644 --- a/docs/guides/datasets.md +++ b/docs/guides/datasets.md @@ -131,6 +131,11 @@ GuideLLM supports various file formats for datasets, including text, CSV, JSON, {"prompt": "Hello, how are you?", "output_tokens_count": 5, "additional_column": "foo", "additional_column2": "bar"} {"prompt": "What is your name?", "output_tokens_count": 3, "additional_column": "baz", "additional_column2": "qux"} ``` +- **Trace files (`.jsonl` with `trace_synthetic` type)**: Specialized JSONL files for replay benchmarking with `timestamp`, `input_length`, and `output_length` fields. Used with `--profile replay` to reproduce production traffic patterns. See [Trace Replay Benchmarking](../getting-started/benchmark.md#trace-replay-benchmarking). + ```json + {"timestamp": 0, "input_length": 256, "output_length": 128} + {"timestamp": 0.5, "input_length": 512, "output_length": 64} + ``` - **JSON files (`.json`)**: Where the entire dataset is represented as a JSON array of objects nested under a specific key. To surface the correct key to use, a `--data-column-mapper` argument must be passed in of `"field": "NAME"` for where the array exists. 
The objects should include `prompt` or other common names for the prompt which will be used as the prompt column. Additional fields can be included based on the previously mentioned aliases for the `--data-column-mapper` argument. ```json { From a8e64440a3a2b89b1f1470efec2df08f923bfd8d Mon Sep 17 00:00:00 2001 From: Vincent Gimenes Date: Mon, 20 Apr 2026 10:08:37 +0200 Subject: [PATCH 06/27] refactor: move trace_io to utils for cross-component sharing - Relocate trace_io module from data/ to utils/ - Update imports in scheduler/strategies.py - Update imports in benchmark/profiles.py - Update imports in data/deserializers/trace_synthetic.py - Update imports in tests/unit/scheduler/test_trace_replay.py Signed-off-by: Vincent Gimenes --- src/guidellm/benchmark/profiles.py | 2 +- src/guidellm/data/deserializers/trace_synthetic.py | 2 +- src/guidellm/scheduler/__init__.py | 2 -- src/guidellm/scheduler/strategies.py | 2 -- src/guidellm/{data => utils}/trace_io.py | 0 tests/unit/scheduler/test_trace_replay.py | 7 ++----- 6 files changed, 4 insertions(+), 11 deletions(-) rename src/guidellm/{data => utils}/trace_io.py (100%) diff --git a/src/guidellm/benchmark/profiles.py b/src/guidellm/benchmark/profiles.py index 5501d08e5..8d4dec39d 100644 --- a/src/guidellm/benchmark/profiles.py +++ b/src/guidellm/benchmark/profiles.py @@ -39,9 +39,9 @@ SynchronousStrategy, ThroughputStrategy, TraceReplayStrategy, - load_relative_timestamps, ) from guidellm.schemas import PydanticClassRegistryMixin +from guidellm.utils.trace_io import load_relative_timestamps if TYPE_CHECKING: from guidellm.benchmark.schemas import Benchmark diff --git a/src/guidellm/data/deserializers/trace_synthetic.py b/src/guidellm/data/deserializers/trace_synthetic.py index fe366f69d..ab9849d5d 100644 --- a/src/guidellm/data/deserializers/trace_synthetic.py +++ b/src/guidellm/data/deserializers/trace_synthetic.py @@ -20,7 +20,7 @@ DatasetDeserializer, DatasetDeserializerFactory, ) -from guidellm.data.trace_io import load_trace_rows +from guidellm.utils.trace_io import load_trace_rows __all__ = ["TraceSyntheticDatasetDeserializer"] diff --git a/src/guidellm/scheduler/__init__.py b/src/guidellm/scheduler/__init__.py index c772f0ff5..1aafd994b 100644 --- a/src/guidellm/scheduler/__init__.py +++ b/src/guidellm/scheduler/__init__.py @@ -51,7 +51,6 @@ SynchronousStrategy, ThroughputStrategy, TraceReplayStrategy, - load_relative_timestamps, ) from .worker import WorkerProcess from .worker_group import WorkerProcessGroup @@ -96,5 +95,4 @@ "UnserializableConstraintInitializer", "WorkerProcess", "WorkerProcessGroup", - "load_relative_timestamps", ] diff --git a/src/guidellm/scheduler/strategies.py b/src/guidellm/scheduler/strategies.py index c3c5714c5..316a532a4 100644 --- a/src/guidellm/scheduler/strategies.py +++ b/src/guidellm/scheduler/strategies.py @@ -26,7 +26,6 @@ from pydantic import Field, NonNegativeFloat, NonNegativeInt, PositiveInt, PrivateAttr -from guidellm.data.trace_io import load_relative_timestamps from guidellm.schemas import PydanticClassRegistryMixin, RequestInfo from guidellm.utils.mixins import InfoMixin @@ -40,7 +39,6 @@ "SynchronousStrategy", "ThroughputStrategy", "TraceReplayStrategy", - "load_relative_timestamps", ] diff --git a/src/guidellm/data/trace_io.py b/src/guidellm/utils/trace_io.py similarity index 100% rename from src/guidellm/data/trace_io.py rename to src/guidellm/utils/trace_io.py diff --git a/tests/unit/scheduler/test_trace_replay.py b/tests/unit/scheduler/test_trace_replay.py index f1eb34ac7..f4302b645 
100644 --- a/tests/unit/scheduler/test_trace_replay.py +++ b/tests/unit/scheduler/test_trace_replay.py @@ -17,12 +17,9 @@ import pytest -from guidellm.scheduler import ( - SchedulingStrategy, - TraceReplayStrategy, - load_relative_timestamps, -) +from guidellm.scheduler import SchedulingStrategy, TraceReplayStrategy from guidellm.schemas import RequestInfo +from guidellm.utils.trace_io import load_relative_timestamps def _write_trace(path: Path, content: str) -> Path: From 584f753970b372fc00a8e7a12f9894948d814ec1 Mon Sep 17 00:00:00 2001 From: Vincent Gimenes Date: Mon, 20 Apr 2026 11:22:03 +0200 Subject: [PATCH 07/27] replace manual trace loading with datasets.load_dataset Signed-off-by: Vincent Gimenes --- src/guidellm/utils/trace_io.py | 72 +++++++++++++++++----------------- 1 file changed, 36 insertions(+), 36 deletions(-) diff --git a/src/guidellm/utils/trace_io.py b/src/guidellm/utils/trace_io.py index 373d5f355..12f76c3f6 100644 --- a/src/guidellm/utils/trace_io.py +++ b/src/guidellm/utils/trace_io.py @@ -8,10 +8,11 @@ from __future__ import annotations -import json from pathlib import Path from typing import Any +from datasets import Dataset, load_dataset + __all__ = ["load_relative_timestamps", "load_trace_rows"] @@ -19,50 +20,51 @@ def load_trace_rows( path: Path | str, required_columns: list[str] | None = None, max_rows: int | None = None, -) -> list[dict[str, Any]]: + **data_kwargs: Any, +) -> Dataset: """ - Load trace file rows as a list of dicts. + Load trace file rows as a HuggingFace Dataset. Supports .jsonl only (one JSON object per line). - If required_columns is set, every row must contain these keys; otherwise - KeyError is raised with a descriptive message. - If max_rows is set, only the first max_rows rows are loaded (for replay + If required_columns is set, every column must exist in the dataset; + otherwise KeyError is raised with a descriptive message. + If max_rows is set, only the first max_rows rows are returned (for replay with a request limit). :param path: Path to the trace file. :param required_columns: Optional list of column/field names that each row must have. :param max_rows: Optional maximum number of rows to load; None means load all. - If set to a value less than 1, returns an empty list. - :return: List of row dicts (keys and values as in the file). - :raises KeyError: If a required column is missing in the file or in a row. + If set to a value less than 1, returns an empty Dataset. + :param data_kwargs: Additional keyword arguments forwarded to load_dataset. + :return: HuggingFace Dataset (iterable as dicts, column-accessible). + :raises KeyError: If a required column is missing in the dataset. :raises ValueError: If the file format is not .jsonl. 
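+
+ A minimal usage sketch (``trace.jsonl`` here is an illustrative local
+ file, not one shipped with the repo)::
+
+ rows = load_trace_rows("trace.jsonl", required_columns=["timestamp"])
+ first = rows[0] # a plain dict, e.g. {"timestamp": 0.0, ...}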
""" path = Path(path) - if max_rows is not None and max_rows < 1: - return [] suffix = path.suffix.lower() if suffix != ".jsonl": raise ValueError(f"Unsupported trace file format: {suffix}") + if path.stat().st_size == 0: + raise ValueError(f"Trace file is empty or has no valid rows: {path}") + + trace_dataset = load_dataset( + "json", data_files=str(path), split="train", **data_kwargs + ) - rows: list[dict[str, Any]] = [] - with path.open(encoding="utf-8") as f: - for raw_line in f: - if max_rows is not None and len(rows) >= max_rows: - break - line = raw_line.strip() - if not line: - continue - row = json.loads(line) - if not isinstance(row, dict): - continue - if required_columns: - missing = [c for c in required_columns if c not in row] - if missing: - raise KeyError(f"Trace row missing required columns: {missing}") - rows.append(row) - - return rows + if required_columns: + missing = [c for c in required_columns if c not in trace_dataset.column_names] + if missing: + raise KeyError(f"Trace row missing required columns: {missing}") + + if max_rows is not None: + if max_rows < 1: + return trace_dataset.select([]) + trace_dataset = trace_dataset.select( + range(min(max_rows, len(trace_dataset))) + ) + + return trace_dataset def load_relative_timestamps( @@ -81,12 +83,10 @@ def load_relative_timestamps( :return: List of relative timestamps in seconds (first is 0.0, always sorted). :raises ValueError: If the trace file is empty or has no valid rows. """ - raw = load_trace_rows( - path, - required_columns=[timestamp_column], - ) - timestamps = sorted([float(row[timestamp_column]) for row in raw]) - if not timestamps: - raise ValueError(f"Trace file has no valid rows: {path}") + trace_dataset = load_trace_rows(path, required_columns=[timestamp_column]) + if len(trace_dataset) == 0: + raise ValueError(f"Trace file is empty or has no valid rows: {path}") + trace_dataset = trace_dataset.sort(timestamp_column) + timestamps = [float(t) for t in trace_dataset[timestamp_column]] t0 = timestamps[0] return [t - t0 for t in timestamps] From cde76f472b27d57b83af27f3d83117224476ae92 Mon Sep 17 00:00:00 2001 From: Vincent Gimenes Date: Mon, 20 Apr 2026 16:01:15 +0200 Subject: [PATCH 08/27] refactor benchmark.entrypoints: remove max_requests data truncation Signed-off-by: Vincent Gimenes --- src/guidellm/benchmark/entrypoints.py | 23 ----------------------- 1 file changed, 23 deletions(-) diff --git a/src/guidellm/benchmark/entrypoints.py b/src/guidellm/benchmark/entrypoints.py index f623a7ab3..07bec59ae 100644 --- a/src/guidellm/benchmark/entrypoints.py +++ b/src/guidellm/benchmark/entrypoints.py @@ -250,7 +250,6 @@ async def resolve_request_loader( data_num_workers: int | None, random_seed: int, console: Console | None = None, - max_requests: int | None = None, **dataloader_kwargs: dict[str, Any] | None, ) -> DataLoader[GenerationRequest]: """ @@ -274,7 +273,6 @@ async def resolve_request_loader( :param data_num_workers: Number of worker processes for data loading :param random_seed: Seed for reproducible random operations :param console: Console instance for progress reporting, or None - :param max_requests: If set, first data source loads at most this many rows. 
:param dataloader_kwargs: Additional arguments passed to DataLoader initialization :return: Configured DataLoader instance for GenerationRequest objects :raises ValueError: If request formatter type is not registered in @@ -311,17 +309,6 @@ async def resolve_request_loader( data_finalizer, ) - # When max_requests is set, limit the first data source to that many rows at load - if max_requests is not None and data: - if max_requests < 1: - raise ValueError( - "max_requests must be >= 1 when set for data truncation, " - f"got {max_requests}" - ) - data_args = list(data_args) if data_args else [{} for _ in data] - if len(data_args) >= 1: - data_args[0] = {**data_args[0], "max_rows": max_requests} - request_loader: DataLoader[GenerationRequest] = DataLoader( data=data, data_args=data_args, @@ -539,24 +526,14 @@ async def benchmark_generative_text( "console": console, } - # For replay profile: resolve profile first to apply max_seconds filtering, - # then use the filtered count for the data loader. This ensures the data - # loader and scheduler both work with the same filtered request count. if args.profile == "replay": profile = await resolve_profile(**profile_kwargs, data=args.data) # type: ignore[arg-type] - effective_max_requests = ( - profile.constraints.get("max_requests") - if profile.constraints - else args.max_requests - ) request_loader = await resolve_request_loader( **loader_kwargs, # type: ignore[arg-type,misc] - max_requests=effective_max_requests, # type: ignore[arg-type] ) else: request_loader = await resolve_request_loader( **loader_kwargs, # type: ignore[arg-type,misc] - max_requests=args.max_requests, # type: ignore[arg-type] ) profile = await resolve_profile(**profile_kwargs, data=None) # type: ignore[arg-type] From a229476bfdc2b85bc6cce8db8359af66c89be4d5 Mon Sep 17 00:00:00 2001 From: Vincent Gimenes Date: Tue, 21 Apr 2026 11:05:47 +0200 Subject: [PATCH 09/27] refactor benchmark.entrypoint: remove replay special case Signed-off-by: Vincent Gimenes --- src/guidellm/benchmark/entrypoints.py | 78 ++++++++++++--------------- 1 file changed, 34 insertions(+), 44 deletions(-) diff --git a/src/guidellm/benchmark/entrypoints.py b/src/guidellm/benchmark/entrypoints.py index 07bec59ae..c461a0b72 100644 --- a/src/guidellm/benchmark/entrypoints.py +++ b/src/guidellm/benchmark/entrypoints.py @@ -492,50 +492,24 @@ async def benchmark_generative_text( processor = await resolve_processor( processor=args.processor, model=model, console=console ) - - # Build common kwargs for resolve_profile and resolve_request_loader - profile_kwargs = { - "profile": args.profile, - "rate": args.rate, - "random_seed": args.random_seed, - "rampup": args.rampup, - "constraints": constraints, - "max_seconds": args.max_seconds, - "max_requests": args.max_requests, - "max_errors": args.max_errors, - "max_error_rate": args.max_error_rate, - "max_global_error_rate": args.max_global_error_rate, - "over_saturation": args.over_saturation, - "console": console, - } - loader_kwargs = { - "data": args.data, - "model": model, - "data_args": args.data_args, - "data_samples": args.data_samples, - "processor": processor, - "processor_args": args.processor_args, - "data_column_mapper": args.data_column_mapper, - "data_preprocessors": args.data_preprocessors, - "data_preprocessors_kwargs": args.data_preprocessors_kwargs, - "data_finalizer": args.data_finalizer, - "data_collator": args.data_collator, - "data_sampler": args.data_sampler, - "data_num_workers": args.data_num_workers, - "random_seed": args.random_seed, - 
"console": console, - } - - if args.profile == "replay": - profile = await resolve_profile(**profile_kwargs, data=args.data) # type: ignore[arg-type] - request_loader = await resolve_request_loader( - **loader_kwargs, # type: ignore[arg-type,misc] - ) - else: - request_loader = await resolve_request_loader( - **loader_kwargs, # type: ignore[arg-type,misc] - ) - profile = await resolve_profile(**profile_kwargs, data=None) # type: ignore[arg-type] + request_loader = await resolve_request_loader( + data=args.data, + model=model, + data_args=args.data_args, + data_samples=args.data_samples, + processor=processor, + processor_args=args.processor_args, + data_column_mapper=args.data_column_mapper, + data_preprocessors=args.data_preprocessors, + data_preprocessors_kwargs=args.data_preprocessors_kwargs, + data_finalizer=args.data_finalizer, + data_collator=args.data_collator, + data_sampler=args.data_sampler, + data_num_workers=args.data_num_workers, + random_seed=args.random_seed, + console=console, + **(args.dataloader_kwargs or {}), + ) warmup = TransientPhaseConfig.create_from_value(args.warmup) cooldown = TransientPhaseConfig.create_from_value(args.cooldown) @@ -551,6 +525,22 @@ async def benchmark_generative_text( ), status="success", ) + + profile = await resolve_profile( + profile=args.profile, + rate=args.rate, + random_seed=args.random_seed, + rampup=args.rampup, + constraints=constraints, + max_seconds=args.max_seconds, + max_requests=args.max_requests, + max_errors=args.max_errors, + max_error_rate=args.max_error_rate, + max_global_error_rate=args.max_global_error_rate, + over_saturation=args.over_saturation, + console=console, + data=args.data, + ) output_formats = await resolve_output_formats( outputs=args.outputs, output_dir=args.output_dir, console=console ) From d43e920cfe3cd3e16fd37c2a7876678c890f753e Mon Sep 17 00:00:00 2001 From: Vincent Gimenes Date: Tue, 21 Apr 2026 14:54:29 +0200 Subject: [PATCH 10/27] fix replay profile dataset filtering semantics Signed-off-by: Vincent Gimenes --- src/guidellm/benchmark/entrypoints.py | 8 + src/guidellm/benchmark/profiles.py | 52 +---- tests/e2e/test_successful_benchmark.py | 20 +- tests/unit/benchmark/test_replay_profile.py | 207 ++------------------ 4 files changed, 44 insertions(+), 243 deletions(-) diff --git a/src/guidellm/benchmark/entrypoints.py b/src/guidellm/benchmark/entrypoints.py index c461a0b72..91344768d 100644 --- a/src/guidellm/benchmark/entrypoints.py +++ b/src/guidellm/benchmark/entrypoints.py @@ -356,6 +356,7 @@ async def resolve_profile( over_saturation: dict[str, Any] | None = None, console: Console | None = None, data: list[Any] | None = None, + **profile_kwargs: Any, ) -> Profile: """ Resolve and configure a benchmark profile with rate and constraint settings. @@ -378,6 +379,7 @@ async def resolve_profile( :param over_saturation: Over-saturation detection configuration (dict) :param console: Console instance for progress reporting, or None :param data: Optional list of data sources. + :param profile_kwargs: Additional profile-specific arguments. 
:return: Configured Profile instance ready for benchmarking :raises ValueError: If constraints are provided with a pre-configured Profile """ @@ -406,6 +408,7 @@ async def resolve_profile( rampup_duration=rampup, constraints={**constraints}, data=data, + **profile_kwargs, ) elif constraints: raise ValueError( @@ -526,6 +529,10 @@ async def benchmark_generative_text( status="success", ) + profile_kwargs: dict[str, Any] = {} + if args.profile == "replay": + profile_kwargs["data_samples"] = request_loader.info.get("data_samples", -1) + profile = await resolve_profile( profile=args.profile, rate=args.rate, @@ -540,6 +547,7 @@ async def benchmark_generative_text( over_saturation=args.over_saturation, console=console, data=args.data, + **profile_kwargs, ) output_formats = await resolve_output_formats( outputs=args.outputs, output_dir=args.output_dir, console=console diff --git a/src/guidellm/benchmark/profiles.py b/src/guidellm/benchmark/profiles.py index 8d4dec39d..6c31cafa8 100644 --- a/src/guidellm/benchmark/profiles.py +++ b/src/guidellm/benchmark/profiles.py @@ -341,11 +341,9 @@ class ReplayProfile(Profile): For this profile, the ``rate`` argument is interpreted as time_scale (scale factor applied to relative timestamps), not as requests per second. - When ``constraints["max_requests"]`` is set, the trace is truncated at load time: - only the first max_requests rows are loaded from the file for both timestamps (here) - and request data (in the data loader). This keeps timestamps and requests aligned. The trace file is read twice: once by the data pipeline for request payloads, and - once here for relative timestamps. + once here for relative timestamps. When ``data_samples`` is set, the replayed + timestamps are truncated to match the sampled dataset size. 
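+
+ A sketch of the intended semantics, mirroring the unit tests
+ (``trace.jsonl`` is illustrative)::
+
+ kwargs = ReplayProfile.resolve_args(
+ rate_type="replay", rate=[2.0], random_seed=42,
+ data=["trace.jsonl"], data_samples=2,
+ )
+ # For sorted trace timestamps [3.0, 3.5, 4.0] this yields
+ # relative_timestamps == [0.0, 0.5] and time_scale == 2.0;
+ # any runtime constraints are passed through unchanged.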
""" type_: Literal["replay"] = "replay" # type: ignore[assignment] @@ -357,13 +355,6 @@ class ReplayProfile(Profile): gt=0, description="Scale factor applied to relative timestamps", ) - max_seconds_filter: float | None = Field( - default=None, - description=( - "Original max_seconds value used as a load-time filter " - "(not a runtime constraint)" - ), - ) @classmethod def resolve_args( @@ -380,53 +371,26 @@ def resolve_args( path = Path(data[0]) if isinstance(data[0], str) else data[0] if not path.exists(): raise ValueError(f"Replay trace file not found: {path}") - constraints = kwargs.get("constraints") or {} - max_requests = constraints.get("max_requests") - if max_requests is not None and max_requests < 1: - raise ValueError( - "max_requests must be >= 1 when set for replay profile, " - f"got {max_requests}" - ) # For replay profile, rate is interpreted as time_scale (not requests per # second) time_scale = rate[0] if rate and len(rate) > 0 else 1.0 - # Load all timestamps first (max_requests applied after max_seconds filtering) relative_timestamps = load_relative_timestamps(path) - - # Filter by max_seconds (applied in simulated time via time_scale) - max_seconds = constraints.get("max_seconds") - if max_seconds is not None and max_seconds > 0: - relative_timestamps = [ - ts for ts in relative_timestamps if ts * time_scale <= max_seconds - ] - - # Truncate by max_requests on top of any max_seconds filtering - if max_requests is not None: - relative_timestamps = relative_timestamps[:max_requests] + data_samples = kwargs.get("data_samples", -1) + if isinstance(data_samples, int) and data_samples > 0: + relative_timestamps = relative_timestamps[:data_samples] if not relative_timestamps: raise ValueError( - "No timestamps remain after applying max_seconds and max_requests " - "filters. The trace is empty or all events were filtered out." + "No timestamps remain after applying data_samples. " + "The trace is empty or all events were filtered out." ) - # Set max_requests to the actual count after filtering to prevent benchmark hang - # and eliminate race conditions between request completion and injection. - constraints["max_requests"] = len(relative_timestamps) - - # Remove max_seconds to avoid runtime MaxDurationConstraint canceling - # in-flight requests - constraints.pop("max_seconds", None) - return { "relative_timestamps": relative_timestamps, "time_scale": time_scale, - "constraints": constraints, - "max_seconds_filter": max_seconds - if max_seconds and max_seconds > 0 - else None, + "constraints": kwargs.get("constraints"), } @property diff --git a/tests/e2e/test_successful_benchmark.py b/tests/e2e/test_successful_benchmark.py index 85ada36e6..0de80f91f 100644 --- a/tests/e2e/test_successful_benchmark.py +++ b/tests/e2e/test_successful_benchmark.py @@ -234,15 +234,13 @@ def test_replay_profile_max_seconds_stronger_than_max_requests( ): """ Test replay profile where max_seconds is the limiting constraint. - Trace has 20 requests over 2 seconds, but max_seconds=0.3 limits to ~3 requests. - max_requests=10 is not reached because max_seconds triggers first. + Trace has 20 requests over 2 seconds, but max_seconds=0.25 stops the replay + before max_requests=10 can be reached. """ report_name = "replay_max_seconds_stronger.json" report_path = tmp_path / report_name # Create trace with 20 requests at 0.1s intervals - # With time_scale=1.0, timestamps are: 0.0, 0.1, 0.2, 0.3, 0.4, ... 
- # max_seconds=0.25 should include: 0.0, 0.1, 0.2 (3 requests, 0.3 > 0.25) trace_file = _create_trace_file(tmp_path, num_requests=20, interval=0.1) client = GuidellmClient( @@ -251,8 +249,7 @@ def test_replay_profile_max_seconds_stronger_than_max_requests( outputs=report_name, ) - # max_seconds=0.25 should be the limiting constraint - # Only timestamps <= 0.25 should be kept: 0.0, 0.1, 0.2 + # max_seconds=0.25 should be the limiting runtime constraint client.start_benchmark( profile="replay", rate=1.0, @@ -268,15 +265,14 @@ def test_replay_profile_max_seconds_stronger_than_max_requests( report = load_benchmark_report(report_path) benchmark = report["benchmarks"][0] - # Should have 3 requests (0.0, 0.1, 0.2 where 0.2 <= 0.25) successful_requests = benchmark["requests"]["successful"] - assert len(successful_requests) == 3, ( - f"Expected 3 requests (max_seconds=0.25 filter), got {len(successful_requests)}" + assert 0 < len(successful_requests) < 10, ( + "Expected max_seconds to stop replay before max_requests was reached, " + f"got {len(successful_requests)} successful requests" ) - # Verify max_requests constraint was triggered - # (max_seconds is converted to max_requests internally) - assert_constraint_triggered(benchmark, "max_requests", {"processed_exceeded": True}) + # Verify the runtime max_seconds constraint was triggered + assert_constraint_triggered(benchmark, "max_seconds", {"duration_exceeded": True}) # Helper functions for trace file creation diff --git a/tests/unit/benchmark/test_replay_profile.py b/tests/unit/benchmark/test_replay_profile.py index cb32273aa..dea074d27 100644 --- a/tests/unit/benchmark/test_replay_profile.py +++ b/tests/unit/benchmark/test_replay_profile.py @@ -10,8 +10,6 @@ from __future__ import annotations from pathlib import Path -from typing import Any - import pytest from guidellm.benchmark.profiles import Profile, ReplayProfile @@ -145,27 +143,10 @@ def test_next_strategy_returns_trace_then_none(self, tmp_path: Path): assert s1.time_scale == 1.0 assert profile.next_strategy(s1, None) is None - @pytest.mark.smoke - def test_max_requests_less_than_one_raises(self, tmp_path: Path): - """max_requests < 1 in constraints raises ValueError.""" - trace = _trace_path( - tmp_path, - ['{"timestamp": 0, "input_length": 1, "output_length": 1}'], - ) - with pytest.raises(ValueError, match="max_requests must be >= 1"): - ReplayProfile.resolve_args( - rate_type="replay", - rate=[1.0], - random_seed=42, - data=[str(trace)], - constraints={"max_requests": 0}, - ) - @pytest.mark.smoke @pytest.mark.parametrize( - ("trace_lines", "max_req", "expected_ts"), + ("trace_lines", "data_samples", "expected_ts"), [ - # Basic truncation ( [ '{"timestamp": 0, "input_length": 1, "output_length": 1}', @@ -176,7 +157,6 @@ def test_max_requests_less_than_one_raises(self, tmp_path: Path): 2, [0.0, 1.0], ), - # Truncate concurrent burst (first 2 of 5 same-timestamp requests) ( [ '{"timestamp": 1.0, "input_length": 100, "output_length": 10}', @@ -188,9 +168,6 @@ def test_max_requests_less_than_one_raises(self, tmp_path: Path): 2, [0.0, 0.0], ), - # Truncate after sorting (all rows loaded, sorted, then truncated) - # File order: 5.0, 2.0, 8.0, 1.0 -> sorted: 1.0, 2.0, 5.0, 8.0 - # Relative: 0.0, 1.0, 4.0, 7.0 -> truncated to 3: 0.0, 1.0, 4.0 ( [ '{"timestamp": 5.0, "input_length": 100, "output_length": 10}', @@ -199,133 +176,49 @@ def test_max_requests_less_than_one_raises(self, tmp_path: Path): '{"timestamp": 1.0, "input_length": 400, "output_length": 40}', ], 3, - [0.0, 1.0, 4.0], # 
1.0->0.0, 2.0->1.0, 5.0->4.0 + [0.0, 1.0, 4.0], ), ], ) - def test_max_requests_truncates_timestamps( - self, tmp_path: Path, trace_lines, max_req, expected_ts + def test_data_samples_truncates_timestamps( + self, tmp_path: Path, trace_lines, data_samples, expected_ts ): - """max_requests truncates timestamps to first N rows (handles - duplicates/unsorted).""" + """data_samples truncates replay timestamps to match sampled dataset rows.""" trace = _trace_path(tmp_path, trace_lines) kwargs = ReplayProfile.resolve_args( rate_type="replay", rate=[1.0], random_seed=42, data=[str(trace)], - constraints={"max_requests": max_req}, + data_samples=data_samples, ) assert kwargs["relative_timestamps"] == pytest.approx(expected_ts, abs=1e-9) @pytest.mark.smoke - @pytest.mark.parametrize( - ("trace_lines", "rate", "max_seconds", "expected_ts"), - [ - # Basic: time_scale=1.0, max_seconds=1.5 keeps timestamps <= 1.5 - ( - [ - '{"timestamp": 0, "input_length": 1, "output_length": 1}', - '{"timestamp": 0.5, "input_length": 2, "output_length": 2}', - '{"timestamp": 1.0, "input_length": 3, "output_length": 3}', - '{"timestamp": 2.0, "input_length": 4, "output_length": 4}', - ], - [1.0], # time_scale = 1.0 - 1.5, - [0.0, 0.5, 1.0], # 2.0 * 1.0 = 2.0 > 1.5, so excluded - ), - # With time_scale=2.0: effective times are 0, 1.0, 2.0, 4.0 - # max_seconds=1.5 keeps only timestamps where ts * 2.0 <= 1.5 - ( - [ - '{"timestamp": 0, "input_length": 1, "output_length": 1}', - '{"timestamp": 0.5, "input_length": 2, "output_length": 2}', - '{"timestamp": 1.0, "input_length": 3, "output_length": 3}', - '{"timestamp": 2.0, "input_length": 4, "output_length": 4}', - ], - [2.0], # time_scale = 2.0 - 1.5, - [0.0, 0.5], # 1.0 * 2.0 = 2.0 > 1.5, so excluded - ), - # With time_scale=0.5 (speedup): effective times are 0, 0.25, 0.5, 1.0 - # max_seconds=0.8 keeps only timestamps where ts * 0.5 <= 0.8 - ( - [ - '{"timestamp": 0, "input_length": 1, "output_length": 1}', - '{"timestamp": 0.5, "input_length": 2, "output_length": 2}', - '{"timestamp": 1.0, "input_length": 3, "output_length": 3}', - '{"timestamp": 2.0, "input_length": 4, "output_length": 4}', - ], - [0.5], # time_scale = 0.5 - 0.8, - [0.0, 0.5, 1.0], # 2.0 * 0.5 = 1.0 > 0.8, so excluded - ), - # max_seconds larger than all timestamps: all kept - ( - [ - '{"timestamp": 0, "input_length": 1, "output_length": 1}', - '{"timestamp": 1.0, "input_length": 2, "output_length": 2}', - ], - [1.0], - 10.0, - [0.0, 1.0], - ), - # max_seconds very small: only first timestamp kept - ( - [ - '{"timestamp": 0, "input_length": 1, "output_length": 1}', - '{"timestamp": 0.1, "input_length": 2, "output_length": 2}', - '{"timestamp": 0.2, "input_length": 3, "output_length": 3}', - ], - [1.0], - 0.05, - [0.0], - ), - ], - ) - def test_max_seconds_filters_timestamps_with_time_scale( - self, tmp_path: Path, trace_lines, rate, max_seconds, expected_ts - ): - """max_seconds filters timestamps based on effective time (ts * time_scale).""" - trace = _trace_path(tmp_path, trace_lines) - kwargs = ReplayProfile.resolve_args( - rate_type="replay", - rate=rate, - random_seed=42, - data=[str(trace)], - constraints={"max_seconds": max_seconds}, - ) - assert kwargs["relative_timestamps"] == pytest.approx(expected_ts, abs=1e-9) - assert kwargs["time_scale"] == rate[0] - - @pytest.mark.smoke - def test_max_seconds_with_max_requests_both_apply(self, tmp_path: Path): - """Both max_seconds and max_requests constraints apply (intersection).""" + def test_constraints_remain_runtime_only(self, tmp_path: Path): + 
"""Runtime constraints are preserved and do not filter replay timestamps.""" trace = _trace_path( tmp_path, [ '{"timestamp": 0, "input_length": 1, "output_length": 1}', - '{"timestamp": 1.0, "input_length": 2, "output_length": 2}', - '{"timestamp": 2.0, "input_length": 3, "output_length": 3}', - '{"timestamp": 3.0, "input_length": 4, "output_length": 4}', - '{"timestamp": 4.0, "input_length": 5, "output_length": 5}', + '{"timestamp": 0.5, "input_length": 2, "output_length": 2}', + '{"timestamp": 1.0, "input_length": 3, "output_length": 3}', ], ) kwargs = ReplayProfile.resolve_args( rate_type="replay", - rate=[1.0], + rate=[2.0], random_seed=42, data=[str(trace)], - constraints={"max_requests": 4, "max_seconds": 2.5}, + constraints={"max_requests": 2, "max_seconds": 1.5}, ) - # max_requests limits to first 4: [0, 1.0, 2.0, 3.0] - # Then max_seconds filters to <= 2.5: [0, 1.0, 2.0] - assert kwargs["relative_timestamps"] == pytest.approx([0.0, 1.0, 2.0], abs=1e-9) + assert kwargs["relative_timestamps"] == pytest.approx([0.0, 0.5, 1.0], abs=1e-9) + assert kwargs["constraints"] == {"max_requests": 2, "max_seconds": 1.5} + assert kwargs["time_scale"] == 2.0 @pytest.mark.smoke - def test_max_seconds_filters_and_sets_max_requests(self, tmp_path: Path): - """max_seconds filters timestamps at load time and max_requests is set to - the actual count to synchronize the data loader and prevent benchmark hang.""" + def test_data_samples_and_constraints_are_independent(self, tmp_path: Path): + """data_samples truncates timestamps without mutating runtime constraints.""" trace = _trace_path( tmp_path, [ @@ -336,73 +229,13 @@ def test_max_seconds_filters_and_sets_max_requests(self, tmp_path: Path): '{"timestamp": 4.0, "input_length": 5, "output_length": 5}', ], ) - constraints: dict[str, Any] = {"max_seconds": 2.5} kwargs = ReplayProfile.resolve_args( rate_type="replay", rate=[1.0], random_seed=42, data=[str(trace)], - constraints=constraints, + data_samples=3, + constraints={"max_requests": 10, "max_seconds": 0.25}, ) - # max_seconds=2.5 with time_scale=1.0 keeps ts <= 2.5: [0, 1.0, 2.0] - # = 3 requests assert kwargs["relative_timestamps"] == pytest.approx([0.0, 1.0, 2.0], abs=1e-9) - # max_requests is always set to actual count after filtering - assert constraints.get("max_requests") == 3 - # max_seconds is removed to avoid runtime constraint conflicts - assert "max_seconds" not in constraints - - @pytest.mark.smoke - def test_max_requests_always_updated_to_actual_count(self, tmp_path: Path): - """max_requests is always set to the actual count of timestamps after - filtering.""" - trace = _trace_path( - tmp_path, - [ - '{"timestamp": 0, "input_length": 1, "output_length": 1}', - '{"timestamp": 1.0, "input_length": 2, "output_length": 2}', - '{"timestamp": 2.0, "input_length": 3, "output_length": 3}', - '{"timestamp": 3.0, "input_length": 4, "output_length": 4}', - ], - ) - constraints: dict[str, Any] = {"max_requests": 2, "max_seconds": 10.0} - kwargs = ReplayProfile.resolve_args( - rate_type="replay", - rate=[1.0], - random_seed=42, - data=[str(trace)], - constraints=constraints, - ) - # max_requests=2 takes first 2 timestamps: [0, 1.0] - # max_seconds=10.0 keeps all (ts * 1.0 <= 10.0) - # Result: [0, 1.0] - but max_requests is always updated to actual count - assert kwargs["relative_timestamps"] == pytest.approx([0.0, 1.0], abs=1e-9) - # constraints['max_requests'] is always set to actual count after filtering - assert constraints.get("max_requests") == 2 # matches len(relative_timestamps) - - 
@pytest.mark.smoke - def test_max_seconds_removed_from_constraints(self, tmp_path: Path): - """max_seconds is removed from constraints after load-time filtering.""" - trace = _trace_path( - tmp_path, - [ - '{"timestamp": 0, "input_length": 1, "output_length": 1}', - '{"timestamp": 1.0, "input_length": 2, "output_length": 2}', - '{"timestamp": 2.0, "input_length": 3, "output_length": 3}', - ], - ) - constraints: dict[str, Any] = {"max_seconds": 1.5} - kwargs = ReplayProfile.resolve_args( - rate_type="replay", - rate=[1.0], - random_seed=42, - data=[str(trace)], - constraints=constraints, - ) - # max_seconds should be removed to avoid runtime MaxDurationConstraint - assert "max_seconds" not in constraints - assert kwargs["constraints"] is constraints - # Verify timestamps were filtered: ts <= 1.5 -> [0, 1.0] - assert kwargs["relative_timestamps"] == pytest.approx([0.0, 1.0], abs=1e-9) - # max_requests set to actual count - assert constraints.get("max_requests") == 2 + assert kwargs["constraints"] == {"max_requests": 10, "max_seconds": 0.25} From c0739e4c12997fc65bfe106172c4604fa88e9772 Mon Sep 17 00:00:00 2001 From: Vincent Gimenes Date: Wed, 22 Apr 2026 16:28:11 +0200 Subject: [PATCH 11/27] erase useless diffs Signed-off-by: Vincent Gimenes --- docs/getting-started/benchmark.md | 8 -- tests/e2e/test_successful_benchmark.py | 184 ------------------------- 2 files changed, 192 deletions(-) diff --git a/docs/getting-started/benchmark.md b/docs/getting-started/benchmark.md index d5a8ed11d..91edc3ea2 100644 --- a/docs/getting-started/benchmark.md +++ b/docs/getting-started/benchmark.md @@ -82,14 +82,6 @@ For example, setting `--max-requests 1000` with `--profile sweep` will run 1000 GuideLLM supports several benchmark profiles and strategies, which are described in detail below. -- `synchronous`: Runs requests one at a time (sequential) -- `throughput`: Tests maximum throughput by running requests in parallel -- `concurrent`: Runs a fixed number of parallel request streams -- `constant`: Sends requests at a fixed rate per second -- `poisson`: Sends requests following a Poisson distribution -- `sweep`: Automatically determines optimal performance points (default) -- `replay`: Replays requests from a trace file to reproduce real-world traffic patterns (beta) - #### Synchronous Profile Runs requests one at a time (sequential). diff --git a/tests/e2e/test_successful_benchmark.py b/tests/e2e/test_successful_benchmark.py index 0de80f91f..8703882b2 100644 --- a/tests/e2e/test_successful_benchmark.py +++ b/tests/e2e/test_successful_benchmark.py @@ -120,187 +120,3 @@ def test_max_requests_benchmark(server: VllmSimServer, tmp_path: Path): f"Expected {max_requests} successful requests, got {len(successful_requests)}" ) assert_successful_requests_fields(successful_requests) - - -@pytest.mark.timeout(30) -@pytest.mark.sanity -def test_replay_profile_benchmark(server: VllmSimServer, tmp_path: Path): - """ - Test trace replay profile with a simple trace file. - Validates that requests are replayed with correct timing from trace. - Also tests time_scale (rate) functionality. 
- """ - report_name = "replay_benchmarks.json" - report_path = tmp_path / report_name - - # Create trace file with 5 requests at 0.05s intervals - trace_file = _create_trace_file(tmp_path, num_requests=5, interval=0.05) - - # Create and configure the guidellm client with replay profile - client = GuidellmClient( - target=server.get_url(), - output_dir=tmp_path, - outputs=report_name, - ) - - # Start the benchmark with replay profile - # rate=2.0 means time_scale=2.0 (timestamps multiplied by 2) - client.start_benchmark( - profile="replay", - rate=2.0, - max_requests=5, - data=str(trace_file), - processor="gpt2", - ) - - # Wait for the benchmark to complete - client.wait_for_completion(timeout=30) - - # Assert no Python exceptions occurred - assert_no_python_exceptions(client.stderr) - - # Load and validate the report - report = load_benchmark_report(report_path) - assert len(report["benchmarks"]) == 1 - - benchmark = report["benchmarks"][0] - - # Validate successful requests have all expected fields - successful_requests = benchmark["requests"]["successful"] - assert len(successful_requests) == 5, ( - f"Expected 5 successful requests, got {len(successful_requests)}" - ) - assert_successful_requests_fields(successful_requests) - - # Verify scheduler state shows correct request count - assert "scheduler_state" in benchmark - scheduler_state = benchmark["scheduler_state"] - assert scheduler_state["processed_requests"] == 5 - - -@pytest.mark.timeout(30) -@pytest.mark.sanity -def test_replay_profile_max_requests_stronger_than_max_seconds( - server: VllmSimServer, tmp_path: Path -): - """ - Test replay profile where max_requests is the limiting constraint. - Trace has 20 requests over 2 seconds, but max_requests=5 limits to 5. - max_seconds=10 is not reached because max_requests triggers first. - """ - report_name = "replay_max_requests_stronger.json" - report_path = tmp_path / report_name - - # Create trace with 20 requests at 0.1s intervals (total 1.9s) - trace_file = _create_trace_file(tmp_path, num_requests=20, interval=0.1) - - client = GuidellmClient( - target=server.get_url(), - output_dir=tmp_path, - outputs=report_name, - ) - - # max_requests=5 should be the limiting constraint - # max_seconds=10 should NOT be reached - client.start_benchmark( - profile="replay", - rate=1.0, - max_requests=5, - max_seconds=10, # Very high, won't be reached - data=str(trace_file), - processor="gpt2", - ) - - client.wait_for_completion(timeout=30) - assert_no_python_exceptions(client.stderr) - - report = load_benchmark_report(report_path) - benchmark = report["benchmarks"][0] - - # Should only have 5 requests (max_requests won) - successful_requests = benchmark["requests"]["successful"] - assert len(successful_requests) == 5, ( - f"Expected 5 requests (max_requests limit), got {len(successful_requests)}" - ) - - # Verify max_requests constraint was triggered - assert_constraint_triggered(benchmark, "max_requests", {"processed_exceeded": True}) - - -@pytest.mark.timeout(30) -@pytest.mark.sanity -def test_replay_profile_max_seconds_stronger_than_max_requests( - server: VllmSimServer, tmp_path: Path -): - """ - Test replay profile where max_seconds is the limiting constraint. - Trace has 20 requests over 2 seconds, but max_seconds=0.25 stops the replay - before max_requests=10 can be reached. 
- """ - report_name = "replay_max_seconds_stronger.json" - report_path = tmp_path / report_name - - # Create trace with 20 requests at 0.1s intervals - trace_file = _create_trace_file(tmp_path, num_requests=20, interval=0.1) - - client = GuidellmClient( - target=server.get_url(), - output_dir=tmp_path, - outputs=report_name, - ) - - # max_seconds=0.25 should be the limiting runtime constraint - client.start_benchmark( - profile="replay", - rate=1.0, - max_requests=10, # High, won't be reached - max_seconds=0.25, - data=str(trace_file), - processor="gpt2", - ) - - client.wait_for_completion(timeout=30) - assert_no_python_exceptions(client.stderr) - - report = load_benchmark_report(report_path) - benchmark = report["benchmarks"][0] - - successful_requests = benchmark["requests"]["successful"] - assert 0 < len(successful_requests) < 10, ( - "Expected max_seconds to stop replay before max_requests was reached, " - f"got {len(successful_requests)} successful requests" - ) - - # Verify the runtime max_seconds constraint was triggered - assert_constraint_triggered(benchmark, "max_seconds", {"duration_exceeded": True}) - - -# Helper functions for trace file creation - - -def _create_trace_file( - tmp_path: Path, num_requests: int = 5, interval: float = 0.1 -) -> Path: - """Create a trace file with evenly spaced timestamps for testing.""" - trace_file = tmp_path / "trace.jsonl" - lines = [ - f'{{"timestamp": {i * interval}, ' - f'"input_length": {10 * (i + 1)}, ' - f'"output_length": {5 * (i + 1)}}}' - for i in range(num_requests) - ] - trace_file.write_text("\n".join(lines)) - return trace_file - - -def _create_burst_trace_file(tmp_path: Path, num_requests: int = 10) -> Path: - """Create a trace file with all requests at the same timestamp.""" - trace_file = tmp_path / "trace_burst.jsonl" - lines = [ - f'{{"timestamp": 0.0, ' - f'"input_length": {20 * (i + 1)}, ' - f'"output_length": {10 * (i + 1)}}}' - for i in range(num_requests) - ] - trace_file.write_text("\n".join(lines)) - return trace_file From 7d76d5f62945119a80e2a25765ef9afb6bd31575 Mon Sep 17 00:00:00 2001 From: Vincent Gimenes Date: Wed, 22 Apr 2026 16:45:01 +0200 Subject: [PATCH 12/27] fix trace replay tests for multiprocessing context Signed-off-by: Vincent Gimenes --- tests/unit/scheduler/test_trace_replay.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/tests/unit/scheduler/test_trace_replay.py b/tests/unit/scheduler/test_trace_replay.py index f4302b645..3f1b65d25 100644 --- a/tests/unit/scheduler/test_trace_replay.py +++ b/tests/unit/scheduler/test_trace_replay.py @@ -13,6 +13,7 @@ import asyncio import json import math +from multiprocessing import get_context from pathlib import Path import pytest @@ -171,7 +172,11 @@ def test_next_request_time_scaled_timestamps(self): relative_timestamps=[0.0, 0.5, 1.0], time_scale=2.0, ) - strategy.init_processes_timings(worker_count=1, max_concurrency=10) + strategy.init_processes_timings( + worker_count=1, + max_concurrency=10, + mp_context=get_context(), + ) strategy.init_processes_start(1000.0) expected = [1000.0, 1001.0, 1002.0] @@ -194,7 +199,11 @@ def test_next_request_time_beyond_trace_parks_worker(self): relative_timestamps=[0.0, 1.0], time_scale=1.0, ) - strategy.init_processes_timings(worker_count=1, max_concurrency=10) + strategy.init_processes_timings( + worker_count=1, + max_concurrency=10, + mp_context=get_context(), + ) strategy.init_processes_start(500.0) async def run(): @@ -239,7 +248,11 @@ def test_scheduling_patterns(self, timestamps, 
expected): relative_timestamps=timestamps, time_scale=1.0, ) - strategy.init_processes_timings(worker_count=3, max_concurrency=10) + strategy.init_processes_timings( + worker_count=3, + max_concurrency=10, + mp_context=get_context(), + ) strategy.init_processes_start(1000.0) async def run(): From 8a7addebb73917a4f620c887aa90de696e6088bc Mon Sep 17 00:00:00 2001 From: Vincent Gimenes Date: Wed, 22 Apr 2026 17:34:26 +0200 Subject: [PATCH 13/27] fix ruff issue Signed-off-by: Vincent Gimenes --- src/guidellm/utils/trace_io.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/guidellm/utils/trace_io.py b/src/guidellm/utils/trace_io.py index 12f76c3f6..4affdd704 100644 --- a/src/guidellm/utils/trace_io.py +++ b/src/guidellm/utils/trace_io.py @@ -60,9 +60,7 @@ def load_trace_rows( if max_rows is not None: if max_rows < 1: return trace_dataset.select([]) - trace_dataset = trace_dataset.select( - range(min(max_rows, len(trace_dataset))) - ) + trace_dataset = trace_dataset.select(range(min(max_rows, len(trace_dataset)))) return trace_dataset From a6126fb1fe69e9f036b0cfce3d460079110d5617 Mon Sep 17 00:00:00 2001 From: Vincent Gimenes Date: Thu, 23 Apr 2026 12:21:38 +0200 Subject: [PATCH 14/27] refactor trace_synthetic and trace_io: remove max_rows; use data_samples as sole trace row cap Signed-off-by: Vincent Gimenes --- src/guidellm/data/deserializers/trace_synthetic.py | 12 +----------- src/guidellm/utils/trace_io.py | 10 ---------- 2 files changed, 1 insertion(+), 21 deletions(-) diff --git a/src/guidellm/data/deserializers/trace_synthetic.py b/src/guidellm/data/deserializers/trace_synthetic.py index ab9849d5d..8b6de1e9b 100644 --- a/src/guidellm/data/deserializers/trace_synthetic.py +++ b/src/guidellm/data/deserializers/trace_synthetic.py @@ -58,7 +58,6 @@ def _load_trace_rows( timestamp_column: str, prompt_tokens_column: str, output_tokens_column: str, - max_rows: int | None = None, ) -> list[dict[str, Any]]: """Load trace file into list of dicts with timestamp, prompt_tokens, output_tokens.""" @@ -70,7 +69,6 @@ def _load_trace_rows( prompt_tokens_column, output_tokens_column, ], - max_rows=max_rows, ) except (KeyError, ValueError) as e: raise DataNotSupportedError(str(e)) from e @@ -117,16 +115,8 @@ def __call__( output_tokens_column = str( data_kwargs.pop("output_tokens_column", "output_length") ) - max_rows_val = data_kwargs.pop("max_rows", None) - max_rows: int | None = None - if max_rows_val is not None: - if isinstance(max_rows_val, int): - max_rows = max_rows_val - elif isinstance(max_rows_val, str): - max_rows = int(max_rows_val) - rows = _load_trace_rows( - path, timestamp_column, prompt_tokens_column, output_tokens_column, max_rows + path, timestamp_column, prompt_tokens_column, output_tokens_column ) if not rows: raise DataNotSupportedError("Trace file is empty") diff --git a/src/guidellm/utils/trace_io.py b/src/guidellm/utils/trace_io.py index 4affdd704..203523fa3 100644 --- a/src/guidellm/utils/trace_io.py +++ b/src/guidellm/utils/trace_io.py @@ -19,7 +19,6 @@ def load_trace_rows( path: Path | str, required_columns: list[str] | None = None, - max_rows: int | None = None, **data_kwargs: Any, ) -> Dataset: """ @@ -28,14 +27,10 @@ def load_trace_rows( Supports .jsonl only (one JSON object per line). If required_columns is set, every column must exist in the dataset; otherwise KeyError is raised with a descriptive message. - If max_rows is set, only the first max_rows rows are returned (for replay - with a request limit). :param path: Path to the trace file. 
:param required_columns: Optional list of column/field names that each row must have. - :param max_rows: Optional maximum number of rows to load; None means load all. - If set to a value less than 1, returns an empty Dataset. :param data_kwargs: Additional keyword arguments forwarded to load_dataset. :return: HuggingFace Dataset (iterable as dicts, column-accessible). :raises KeyError: If a required column is missing in the dataset. @@ -57,11 +52,6 @@ def load_trace_rows( if missing: raise KeyError(f"Trace row missing required columns: {missing}") - if max_rows is not None: - if max_rows < 1: - return trace_dataset.select([]) - trace_dataset = trace_dataset.select(range(min(max_rows, len(trace_dataset)))) - return trace_dataset From ba792eb9bd3574a0d70e490afefef1a931780d0d Mon Sep 17 00:00:00 2001 From: Vincent Gimenes Date: Sat, 25 Apr 2026 22:56:13 +0200 Subject: [PATCH 15/27] fix replay profile data sample handling Signed-off-by: Vincent Gimenes --- src/guidellm/benchmark/entrypoints.py | 6 +----- tests/unit/benchmark/test_replay_profile.py | 1 + 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/src/guidellm/benchmark/entrypoints.py b/src/guidellm/benchmark/entrypoints.py index 91344768d..79a1b2614 100644 --- a/src/guidellm/benchmark/entrypoints.py +++ b/src/guidellm/benchmark/entrypoints.py @@ -529,10 +529,6 @@ async def benchmark_generative_text( status="success", ) - profile_kwargs: dict[str, Any] = {} - if args.profile == "replay": - profile_kwargs["data_samples"] = request_loader.info.get("data_samples", -1) - profile = await resolve_profile( profile=args.profile, rate=args.rate, @@ -547,7 +543,7 @@ async def benchmark_generative_text( over_saturation=args.over_saturation, console=console, data=args.data, - **profile_kwargs, + data_samples=request_loader.info.get("data_samples", -1), ) output_formats = await resolve_output_formats( outputs=args.outputs, output_dir=args.output_dir, console=console diff --git a/tests/unit/benchmark/test_replay_profile.py b/tests/unit/benchmark/test_replay_profile.py index dea074d27..d799cf9e1 100644 --- a/tests/unit/benchmark/test_replay_profile.py +++ b/tests/unit/benchmark/test_replay_profile.py @@ -10,6 +10,7 @@ from __future__ import annotations from pathlib import Path + import pytest from guidellm.benchmark.profiles import Profile, ReplayProfile From fc524d2642aa1868f1655f668cd171b90ee00f1e Mon Sep 17 00:00:00 2001 From: Vincent Gimenes Date: Sat, 25 Apr 2026 23:45:54 +0200 Subject: [PATCH 16/27] test: restore e2e utils Signed-off-by: Vincent Gimenes --- tests/e2e/utils.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/e2e/utils.py b/tests/e2e/utils.py index 03f039ef2..55baa89d2 100644 --- a/tests/e2e/utils.py +++ b/tests/e2e/utils.py @@ -45,12 +45,12 @@ def __init__( def start_benchmark( self, profile: str = "constant", - rate: int | float = 10, - max_seconds: int | float | None = None, + rate: int = 10, + max_seconds: int | None = None, max_requests: int | None = None, max_error_rate: float | None = None, over_saturation: dict[str, Any] | None = None, - data: str | Path = "prompt_tokens=256,output_tokens=128", + data: str = "prompt_tokens=256,output_tokens=128", processor: str = "gpt2", additional_args: str = "", extra_env: dict[str, str] | None = None, @@ -59,13 +59,13 @@ def start_benchmark( Start a guidellm benchmark command. :param profile: Type of rate control (constant, etc.) 
- :param rate: Request rate (or time_scale for replay profile) + :param rate: Request rate :param max_seconds: Maximum duration in seconds :param max_requests: Maximum number of requests :param max_error_rate: Maximum error rate before stopping :param over_saturation: Over-saturation detection configuration (dict). Passed as JSON string to --over-saturation CLI argument. - :param data: Data configuration string or Path to trace file for replay profile + :param data: Data configuration string :param processor: Processor/tokenizer to use :param additional_args: Additional command line arguments :param extra_env: Additional environment variables to set @@ -109,7 +109,7 @@ def start_benchmark( cmd_parts.extend( [ - f'--data "{str(data)}"', + f'--data "{data}"', f'--processor "{processor}"', f"--output-dir {self.output_dir}", f"--outputs {self.outputs}", From c0150d0c235d43c18f810a164bf59c9ae17ba7ab Mon Sep 17 00:00:00 2001 From: Vincent Gimenes Date: Sun, 26 Apr 2026 10:41:54 +0200 Subject: [PATCH 17/27] fix trace replay ordering alignment Signed-off-by: Vincent Gimenes --- src/guidellm/data/deserializers/trace_synthetic.py | 2 +- src/guidellm/utils/trace_io.py | 12 ++++++++++-- .../unit/data/deserializers/test_trace_synthetic.py | 4 ++-- 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/src/guidellm/data/deserializers/trace_synthetic.py b/src/guidellm/data/deserializers/trace_synthetic.py index 8b6de1e9b..5aa3f25c3 100644 --- a/src/guidellm/data/deserializers/trace_synthetic.py +++ b/src/guidellm/data/deserializers/trace_synthetic.py @@ -65,10 +65,10 @@ def _load_trace_rows( raw = load_trace_rows( path, required_columns=[ - timestamp_column, prompt_tokens_column, output_tokens_column, ], + timestamp_column=timestamp_column, ) except (KeyError, ValueError) as e: raise DataNotSupportedError(str(e)) from e diff --git a/src/guidellm/utils/trace_io.py b/src/guidellm/utils/trace_io.py index 203523fa3..76c4362ca 100644 --- a/src/guidellm/utils/trace_io.py +++ b/src/guidellm/utils/trace_io.py @@ -19,6 +19,7 @@ def load_trace_rows( path: Path | str, required_columns: list[str] | None = None, + timestamp_column: str | None = None, **data_kwargs: Any, ) -> Dataset: """ @@ -31,6 +32,7 @@ def load_trace_rows( :param path: Path to the trace file. :param required_columns: Optional list of column/field names that each row must have. + :param timestamp_column: Optional timestamp column used to order trace rows. :param data_kwargs: Additional keyword arguments forwarded to load_dataset. :return: HuggingFace Dataset (iterable as dicts, column-accessible). :raises KeyError: If a required column is missing in the dataset. @@ -47,11 +49,18 @@ def load_trace_rows( "json", data_files=str(path), split="train", **data_kwargs ) + required_columns = required_columns or [] + if timestamp_column and timestamp_column not in required_columns: + required_columns = [*required_columns, timestamp_column] + if required_columns: missing = [c for c in required_columns if c not in trace_dataset.column_names] if missing: raise KeyError(f"Trace row missing required columns: {missing}") + if timestamp_column: + trace_dataset = trace_dataset.sort(timestamp_column) + return trace_dataset @@ -71,10 +80,9 @@ def load_relative_timestamps( :return: List of relative timestamps in seconds (first is 0.0, always sorted). :raises ValueError: If the trace file is empty or has no valid rows. 
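
 Example (sketch; ``trace.jsonl`` is illustrative)::

 load_relative_timestamps("trace.jsonl") # file timestamps: 5.0, 2.0, 8.0
 # -> [0.0, 3.0, 6.0]: rows are sorted by ``timestamp_column`` first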
""" - trace_dataset = load_trace_rows(path, required_columns=[timestamp_column]) + trace_dataset = load_trace_rows(path, timestamp_column=timestamp_column) if len(trace_dataset) == 0: raise ValueError(f"Trace file is empty or has no valid rows: {path}") - trace_dataset = trace_dataset.sort(timestamp_column) timestamps = [float(t) for t in trace_dataset[timestamp_column]] t0 = timestamps[0] return [t - t0 for t in timestamps] diff --git a/tests/unit/data/deserializers/test_trace_synthetic.py b/tests/unit/data/deserializers/test_trace_synthetic.py index 4c5462d24..f6610969d 100644 --- a/tests/unit/data/deserializers/test_trace_synthetic.py +++ b/tests/unit/data/deserializers/test_trace_synthetic.py @@ -72,13 +72,13 @@ def deserializer(self): '{"timestamp": 0.3, "input_length": 131072, "output_length": 32768}\n', [(10, 5), (65536, 16384), (20, 10), (131072, 32768)], ), - # Unsorted timestamps with duplicates (preserves file order) + # Unsorted timestamps with duplicates (sorts by timestamp) ( '{"timestamp": 5.0, "input_length": 100, "output_length": 10}\n' '{"timestamp": 2.0, "input_length": 200, "output_length": 20}\n' '{"timestamp": 8.0, "input_length": 300, "output_length": 30}\n' '{"timestamp": 2.0, "input_length": 400, "output_length": 40}\n', - [(100, 10), (200, 20), (300, 30), (400, 40)], + [(200, 20), (400, 40), (100, 10), (300, 30)], ), # Concurrent burst (5 requests at same timestamp) ( From b532df99c58af427fefcabaee2c19b4cbadda4ff Mon Sep 17 00:00:00 2001 From: Vincent Gimenes Date: Sun, 26 Apr 2026 19:16:31 +0200 Subject: [PATCH 18/27] Fix trace replay alignment and semantics across loading, scheduling, and docs Signed-off-by: Vincent Gimenes --- docs/getting-started/benchmark.md | 13 +++-- docs/guides/datasets.md | 18 +++++- src/guidellm/benchmark/entrypoints.py | 1 + src/guidellm/benchmark/profiles.py | 14 ++++- src/guidellm/scheduler/strategies.py | 6 +- tests/unit/benchmark/test_replay_profile.py | 64 +++++++++++++++++++++ tests/unit/scheduler/test_trace_replay.py | 7 +-- 7 files changed, 110 insertions(+), 13 deletions(-) diff --git a/docs/getting-started/benchmark.md b/docs/getting-started/benchmark.md index 91edc3ea2..dd3c29d15 100644 --- a/docs/getting-started/benchmark.md +++ b/docs/getting-started/benchmark.md @@ -65,6 +65,7 @@ GuideLLM offers a wide range of configuration options to customize your benchmar | `--random-seed` | Random seed for reproducibility | `--random-seed 42` | | `--max-seconds` | Duration for each benchmark in seconds | `--max-seconds 30` | | `--max-requests` | Maximum number of requests for each benchmark | `--max-requests 1000` | +| `--data-samples` | Maximum number of dataset rows to load | `--data-samples 1000` | | `--output-dir` | Directory path to save output files | `--output-dir results/` | | `--outputs` | Output formats to generate | `--outputs json csv html` | @@ -189,25 +190,29 @@ You can customize synthetic data generation with additional parameters such as s ### Trace Replay Benchmarking (beta) -For realistic load testing, replay traffic patterns from trace files. Trace files must be JSONL with `timestamp`, `input_length`, and `output_length` fields: +For realistic load testing, replay trace events using each row's timestamp and token lengths. Trace files must be JSONL and are loaded with the `trace_synthetic` data type. 
By default, each row uses `timestamp`, `input_length`, and `output_length` fields: ```json {"timestamp": 0, "input_length": 256, "output_length": 128} {"timestamp": 0.5, "input_length": 512, "output_length": 64} ``` -Run with the `replay` profile: +Run with the `replay` profile. This example also maps custom trace column names: ```bash guidellm benchmark \ --target "http://localhost:8000" \ --data "path/to/trace.jsonl" \ - --data-args '{"type_": "trace_synthetic"}' \ + --data-args '{"type_": "trace_synthetic", "timestamp_column": "ts", "prompt_tokens_column": "input_tokens", "output_tokens_column": "output_tokens"}' \ --profile replay \ --rate 1.0 ``` -The `rate` parameter acts as a time scale: `1.0` for original speed, `2.0` for 2x faster, `0.5` for half speed. +The `--rate` parameter acts as a time scale, not requests per second: `1.0` for original speed, `2.0` to multiply timestamps by 2 and run twice as long, `0.5` to multiply timestamps by 0.5 and run twice as fast. + +GuideLLM orders trace rows by timestamp before scheduling and payload generation, so each scheduled event uses the token lengths from the same sorted row. Use `--data-samples` to limit how many trace rows are loaded and replayed. `--max-requests` remains a runtime completion constraint; it does not truncate the trace dataset. + +If your trace uses the default column names shown above, omit `timestamp_column`, `prompt_tokens_column`, and `output_tokens_column` from `--data-args`. ### Working with Real Data diff --git a/docs/guides/datasets.md b/docs/guides/datasets.md index 9c80d8edc..a181552b8 100644 --- a/docs/guides/datasets.md +++ b/docs/guides/datasets.md @@ -13,6 +13,10 @@ The following arguments can be used to configure datasets and their processing: - `prompt_column`: Specifies the column name for the prompt. By default, GuideLLM will try the most common column names (e.g., `prompt`, `text`, `input`). - `prompt_tokens_count_column`: Specifies the column name for the prompt token count. These are used to set the request prompt token count for counting metrics. By default, GuideLLM assumes no token count is provided. - `output_tokens_count_column`: Specifies the column name for the output token count. These are used to set the request output token count for the request and counting metrics. By default, GuideLLM assumes no token count is provided. + - `type_`: Selects a specialized dataset deserializer, such as `trace_synthetic` for trace replay files. + - `timestamp_column`: Specifies the timestamp column for `trace_synthetic` data. The default is `timestamp`. + - `prompt_tokens_column`: Specifies the prompt token length column for `trace_synthetic` data. The default is `input_length`. + - `output_tokens_column`: Specifies the output token length column for `trace_synthetic` data. The default is `output_length`. - `split`: Specifies the dataset split to use (e.g., `train`, `val`, `test`). By default, GuideLLM will try the most common split names (e.g., `train`, `validation`, `test`) if the dataset has splits, otherwise it will use the entire dataset. - Any remaining arguments are passed directly into the dataset constructor as kwargs. - `--data-sampler`: Specifies the sampling strategy for datasets. By default, no sampling is applied. When set to `random`, it enables random shuffling of the dataset, which can be useful for creating diverse batches during benchmarking. 
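A minimal sketch of the replay `--rate` time scale documented in benchmark.md above, assuming only the `start_time + time_scale * offset` scheduling rule that the TraceReplayStrategy unit tests later in this series assert:

```python
# Minimal sketch of the replay time scale; assumes only the
# start_time + time_scale * offset rule asserted by the unit tests.
def send_times(
    start_time: float, offsets: list[float], time_scale: float
) -> list[float]:
    # Request i fires at start_time + time_scale * offsets[i].
    return [start_time + time_scale * offset for offset in offsets]

# Offsets [0.0, 0.5, 1.0] with --rate 2.0 double every interval:
assert send_times(1000.0, [0.0, 0.5, 1.0], 2.0) == [1000.0, 1001.0, 1002.0]
# --rate 0.5 halves the intervals instead:
assert send_times(1000.0, [0.0, 0.5, 1.0], 0.5) == [1000.0, 1000.25, 1000.5]
```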
@@ -131,11 +135,23 @@ GuideLLM supports various file formats for datasets, including text, CSV, JSON, {"prompt": "Hello, how are you?", "output_tokens_count": 5, "additional_column": "foo", "additional_column2": "bar"} {"prompt": "What is your name?", "output_tokens_count": 3, "additional_column": "baz", "additional_column2": "qux"} ``` -- **Trace files (`.jsonl` with `trace_synthetic` type)**: Specialized JSONL files for replay benchmarking with `timestamp`, `input_length`, and `output_length` fields. Used with `--profile replay` to reproduce production traffic patterns. See [Trace Replay Benchmarking](../getting-started/benchmark.md#trace-replay-benchmarking). +- **Trace files (`.jsonl` with `trace_synthetic` type)**: Specialized JSONL files for replay benchmarking with `timestamp`, `input_length`, and `output_length` fields. Used with `--profile replay` to replay trace events using each row's timestamp and token lengths. See [Trace Replay Benchmarking](../getting-started/benchmark.md#trace-replay-benchmarking). ```json {"timestamp": 0, "input_length": 256, "output_length": 128} {"timestamp": 0.5, "input_length": 512, "output_length": 64} ``` + Trace rows are ordered by timestamp before GuideLLM schedules requests and generates synthetic payloads. This keeps each scheduled event aligned with the prompt and output token lengths from the same row. + + Use `--data-args '{"type_": "trace_synthetic"}'` to enable trace loading. If your trace uses different column names, configure them with `timestamp_column`, `prompt_tokens_column`, and `output_tokens_column`: + ```bash + guidellm benchmark \ + --target "http://localhost:8000" \ + --profile replay \ + --rate 1.0 \ + --data "path/to/trace.jsonl" \ + --data-args '{"type_": "trace_synthetic", "timestamp_column": "ts", "prompt_tokens_column": "input_tokens", "output_tokens_column": "output_tokens"}' + ``` + For replay, `--rate` is a time scale rather than requests per second. Use `--data-samples` to limit how many trace rows are loaded and replayed. Use `--max-requests` only as a runtime completion constraint; it does not limit the trace rows loaded from the file. - **JSON files (`.json`)**: Where the entire dataset is represented as a JSON array of objects nested under a specific key. To surface the correct key to use, a `--data-column-mapper` argument must be passed in of `"field": "NAME"` for where the array exists. The objects should include `prompt` or other common names for the prompt which will be used as the prompt column. Additional fields can be included based on the previously mentioned aliases for the `--data-column-mapper` argument. 
```json { diff --git a/src/guidellm/benchmark/entrypoints.py b/src/guidellm/benchmark/entrypoints.py index 79a1b2614..5ce7c9ed7 100644 --- a/src/guidellm/benchmark/entrypoints.py +++ b/src/guidellm/benchmark/entrypoints.py @@ -543,6 +543,7 @@ async def benchmark_generative_text( over_saturation=args.over_saturation, console=console, data=args.data, + data_args=args.data_args, data_samples=request_loader.info.get("data_samples", -1), ) output_formats = await resolve_output_formats( diff --git a/src/guidellm/benchmark/profiles.py b/src/guidellm/benchmark/profiles.py index 6c31cafa8..ab3d5ceed 100644 --- a/src/guidellm/benchmark/profiles.py +++ b/src/guidellm/benchmark/profiles.py @@ -376,7 +376,19 @@ def resolve_args( # second) time_scale = rate[0] if rate and len(rate) > 0 else 1.0 - relative_timestamps = load_relative_timestamps(path) + # Honor a custom timestamp column when configured via --data-args so the + # scheduler and the trace_synthetic deserializer use the same field. + data_args = kwargs.get("data_args") or [] + first_args = data_args[0] if data_args else {} + timestamp_column = "timestamp" + if isinstance(first_args, dict): + raw_timestamp_column = first_args.get("timestamp_column") + if isinstance(raw_timestamp_column, str) and raw_timestamp_column.strip(): + timestamp_column = raw_timestamp_column + + relative_timestamps = load_relative_timestamps( + path, timestamp_column=timestamp_column + ) data_samples = kwargs.get("data_samples", -1) if isinstance(data_samples, int) and data_samples > 0: relative_timestamps = relative_timestamps[:data_samples] diff --git a/src/guidellm/scheduler/strategies.py b/src/guidellm/scheduler/strategies.py index 316a532a4..90be8ce45 100644 --- a/src/guidellm/scheduler/strategies.py +++ b/src/guidellm/scheduler/strategies.py @@ -702,9 +702,9 @@ def processes_limit(self) -> PositiveInt | None: @property def requests_limit(self) -> PositiveInt | None: - # Cap concurrency to the trace length so workers never hold more - # semaphore slots than there are items to process. - return len(self.relative_timestamps) if self.relative_timestamps else None + # Concurrency is governed by settings.max_concurrency, backend limits, or + # user-provided strategy limits; trace length is not a concurrency cap. 
+ return None async def next_request_time(self, worker_index: NonNegativeInt) -> float: _ = worker_index diff --git a/tests/unit/benchmark/test_replay_profile.py b/tests/unit/benchmark/test_replay_profile.py index d799cf9e1..355282ac0 100644 --- a/tests/unit/benchmark/test_replay_profile.py +++ b/tests/unit/benchmark/test_replay_profile.py @@ -217,6 +217,70 @@ def test_constraints_remain_runtime_only(self, tmp_path: Path): assert kwargs["constraints"] == {"max_requests": 2, "max_seconds": 1.5} assert kwargs["time_scale"] == 2.0 + @pytest.mark.smoke + def test_custom_timestamp_column_via_data_args(self, tmp_path: Path): + """data_args[0]["timestamp_column"] is honored by ReplayProfile.""" + trace = _trace_path( + tmp_path, + [ + '{"ts": 5.0, "input_length": 100, "output_length": 10}', + '{"ts": 2.0, "input_length": 200, "output_length": 20}', + '{"ts": 8.0, "input_length": 300, "output_length": 30}', + ], + ) + kwargs = ReplayProfile.resolve_args( + rate_type="replay", + rate=[1.0], + random_seed=42, + data=[str(trace)], + data_args=[{"timestamp_column": "ts"}], + ) + assert kwargs["relative_timestamps"] == pytest.approx( + [0.0, 3.0, 6.0], abs=1e-9 + ) + + @pytest.mark.sanity + def test_default_timestamp_column_when_data_args_missing(self, tmp_path: Path): + """Without data_args, replay falls back to the default `timestamp` column.""" + trace = _trace_path( + tmp_path, + [ + '{"timestamp": 0, "input_length": 1, "output_length": 1}', + '{"timestamp": 1.0, "input_length": 2, "output_length": 2}', + ], + ) + kwargs = ReplayProfile.resolve_args( + rate_type="replay", + rate=[1.0], + random_seed=42, + data=[str(trace)], + ) + assert kwargs["relative_timestamps"] == pytest.approx([0.0, 1.0], abs=1e-9) + + @pytest.mark.sanity + @pytest.mark.parametrize("invalid_value", [None, "", " ", 123, 1.5, False, []]) + def test_invalid_timestamp_column_falls_back_to_default( + self, + tmp_path: Path, + invalid_value: object, + ): + """Invalid timestamp_column values fall back to default `timestamp`.""" + trace = _trace_path( + tmp_path, + [ + '{"timestamp": 10.0, "input_length": 1, "output_length": 1}', + '{"timestamp": 12.0, "input_length": 2, "output_length": 2}', + ], + ) + kwargs = ReplayProfile.resolve_args( + rate_type="replay", + rate=[1.0], + random_seed=42, + data=[str(trace)], + data_args=[{"timestamp_column": invalid_value}], + ) + assert kwargs["relative_timestamps"] == pytest.approx([0.0, 2.0], abs=1e-9) + @pytest.mark.smoke def test_data_samples_and_constraints_are_independent(self, tmp_path: Path): """data_samples truncates timestamps without mutating runtime constraints.""" diff --git a/tests/unit/scheduler/test_trace_replay.py b/tests/unit/scheduler/test_trace_replay.py index 3f1b65d25..4058cfb5b 100644 --- a/tests/unit/scheduler/test_trace_replay.py +++ b/tests/unit/scheduler/test_trace_replay.py @@ -1,5 +1,3 @@ -## WRITTEN BY AI ## - """ Unit tests for trace replay strategy and load_relative_timestamps. @@ -144,8 +142,9 @@ def test_initialization_and_str(self, timestamps, time_scale): assert strategy.relative_timestamps == timestamps assert strategy.time_scale == time_scale assert strategy.processes_limit is None - # requests_limit equals trace length to cap concurrency to available requests - assert strategy.requests_limit == len(timestamps) + # Trace length must not become the scheduler concurrency cap; concurrency + # is governed by settings/backend/user-provided limits. 
+ assert strategy.requests_limit is None if time_scale == 0.5: assert str(strategy) == "trace@0.50" From e3f317dc79b6dae90ca2aad320be2913a2279ee6 Mon Sep 17 00:00:00 2001 From: Vincent Gimenes Date: Sun, 26 Apr 2026 19:16:31 +0200 Subject: [PATCH 19/27] Fix trace replay alignment and semantics across loading, scheduling, and docs Signed-off-by: Vincent Gimenes --- src/guidellm/scheduler/strategies.py | 6 +++--- tests/unit/scheduler/test_trace_replay.py | 4 +--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/src/guidellm/scheduler/strategies.py b/src/guidellm/scheduler/strategies.py index 90be8ce45..a38e61974 100644 --- a/src/guidellm/scheduler/strategies.py +++ b/src/guidellm/scheduler/strategies.py @@ -702,9 +702,9 @@ def processes_limit(self) -> PositiveInt | None: @property def requests_limit(self) -> PositiveInt | None: - # Concurrency is governed by settings.max_concurrency, backend limits, or - # user-provided strategy limits; trace length is not a concurrency cap. - return None + if not self.relative_timestamps: + return None + return len(self.relative_timestamps) async def next_request_time(self, worker_index: NonNegativeInt) -> float: _ = worker_index diff --git a/tests/unit/scheduler/test_trace_replay.py b/tests/unit/scheduler/test_trace_replay.py index 4058cfb5b..37a14b614 100644 --- a/tests/unit/scheduler/test_trace_replay.py +++ b/tests/unit/scheduler/test_trace_replay.py @@ -142,9 +142,7 @@ def test_initialization_and_str(self, timestamps, time_scale): assert strategy.relative_timestamps == timestamps assert strategy.time_scale == time_scale assert strategy.processes_limit is None - # Trace length must not become the scheduler concurrency cap; concurrency - # is governed by settings/backend/user-provided limits. - assert strategy.requests_limit is None + assert strategy.requests_limit == len(timestamps) if time_scale == 0.5: assert str(strategy) == "trace@0.50" From abfa41c7f5b79f378c2d553c3d18ac41f4aa11ef Mon Sep 17 00:00:00 2001 From: Vincent Gimenes Date: Mon, 27 Apr 2026 10:54:46 +0200 Subject: [PATCH 20/27] refactor unit tests: strengthen trace replay unit coverage Signed-off-by: Vincent Gimenes --- .../data/deserializers/trace_synthetic.py | 22 +- tests/unit/benchmark/test_replay_profile.py | 330 +++++++----------- .../deserializers/test_trace_synthetic.py | 197 ++++++----- tests/unit/scheduler/test_trace_replay.py | 256 +++++--------- 4 files changed, 332 insertions(+), 473 deletions(-) diff --git a/src/guidellm/data/deserializers/trace_synthetic.py b/src/guidellm/data/deserializers/trace_synthetic.py index 5aa3f25c3..43146c1b0 100644 --- a/src/guidellm/data/deserializers/trace_synthetic.py +++ b/src/guidellm/data/deserializers/trace_synthetic.py @@ -12,6 +12,7 @@ from typing import Any from datasets import Dataset +from datasets.exceptions import DatasetGenerationError from faker import Faker from transformers import PreTrainedTokenizerBase @@ -70,16 +71,19 @@ def _load_trace_rows( ], timestamp_column=timestamp_column, ) - except (KeyError, ValueError) as e: + except (DatasetGenerationError, KeyError, ValueError) as e: + raise DataNotSupportedError(str(e)) from e + try: + return [ + { + "timestamp": float(row[timestamp_column]), + "prompt_tokens": int(row[prompt_tokens_column]), + "output_tokens": int(row[output_tokens_column]), + } + for row in raw + ] + except (TypeError, ValueError) as e: raise DataNotSupportedError(str(e)) from e - return [ - { - "timestamp": float(row[timestamp_column]), - "prompt_tokens": int(row[prompt_tokens_column]), - 
"output_tokens": int(row[output_tokens_column]), - } - for row in raw - ] @DatasetDeserializerFactory.register("trace_synthetic") diff --git a/tests/unit/benchmark/test_replay_profile.py b/tests/unit/benchmark/test_replay_profile.py index 355282ac0..b1ce75f2c 100644 --- a/tests/unit/benchmark/test_replay_profile.py +++ b/tests/unit/benchmark/test_replay_profile.py @@ -1,35 +1,25 @@ -## WRITTEN BY AI ## - -""" -Unit tests for ReplayProfile. - -Ensures replay profile loads trace timestamps and creates TraceReplayStrategy with -orrect time_scale. -""" - from __future__ import annotations +import asyncio from pathlib import Path import pytest +from pydantic import ValidationError +from guidellm.benchmark.entrypoints import resolve_profile from guidellm.benchmark.profiles import Profile, ReplayProfile from guidellm.scheduler import TraceReplayStrategy -def _trace_path(tmp_path: Path, lines: list[str]) -> Path: - """Write JSONL lines to a trace file and return its path.""" +def _trace_path(tmp_path: Path, lines: list[str] | None = None) -> Path: path = tmp_path / "trace.jsonl" - path.write_text("\n".join(lines)) + path.write_text("\n".join(lines or [])) return path class TestReplayProfile: - """Tests for ReplayProfile.""" - @pytest.mark.smoke def test_resolve_args_requires_data(self): - """resolve_args raises when data is missing.""" with pytest.raises(ValueError, match="Replay profile requires data"): ReplayProfile.resolve_args( rate_type="replay", @@ -37,211 +27,147 @@ def test_resolve_args_requires_data(self): random_seed=42, ) + @pytest.mark.smoke + def test_resolve_args_rejects_missing_or_empty_trace(self, tmp_path: Path): + missing = tmp_path / "missing.jsonl" + with pytest.raises(ValueError, match="not found"): + ReplayProfile.resolve_args( + rate_type="replay", + rate=[1.0], + random_seed=42, + data=[str(missing)], + ) + + empty = _trace_path(tmp_path) + with pytest.raises(ValueError, match="empty|No timestamps"): + ReplayProfile.resolve_args( + rate_type="replay", + rate=[1.0], + random_seed=42, + data=[str(empty)], + ) + @pytest.mark.smoke @pytest.mark.parametrize( - ("trace_lines", "rate", "expected_ts", "expected_scale"), + ("rate", "expected_scale"), [ - # Basic trace - ( - [ - '{"timestamp": 0, "input_length": 10, "output_length": 5}', - '{"timestamp": 0.5, "input_length": 20, "output_length": 10}', - ], - [2.0], - [0.0, 0.5], - 2.0, - ), - # High token counts (8K-128K contexts) - ( - [ - '{"timestamp": 0, "input_length": 8192, "output_length": 1024}', - '{"timestamp": 0.5, "input_length": 32768, "output_length": 4096}', - '{"timestamp": 1.0, "input_length": 131072,"output_length": 16384}', - ], - [1.0], - [0.0, 0.5, 1.0], - 1.0, - ), - # Unsorted timestamps (sorted chronologically, all >= 0) - ( - [ - '{"timestamp": 5.0, "input_length": 100, "output_length": 10}', - '{"timestamp": 2.0, "input_length": 200, "output_length": 20}', - '{"timestamp": 8.0, "input_length": 300, "output_length": 30}', - ], - [1.0], - [0.0, 3.0, 6.0], # Sorted: 2.0, 5.0, 8.0 -> 0.0, 3.0, 6.0 - 1.0, - ), - # Duplicate timestamps (concurrent burst) - ( - [ - '{"timestamp": 1.0, "input_length": 100, "output_length": 10}', - '{"timestamp": 1.0, "input_length": 200, "output_length": 20}', - '{"timestamp": 1.0, "input_length": 300, "output_length": 30}', - '{"timestamp": 2.5, "input_length": 400, "output_length": 40}', - ], - [2.0], - [0.0, 0.0, 0.0, 1.5], - 2.0, - ), - # High-frequency trace (millisecond-scale) - ( - [ - '{"timestamp": 0.000, "input_length": 100, "output_length": 10}', - '{"timestamp": 0.001, 
"input_length": 200, "output_length": 20}', - '{"timestamp": 0.002, "input_length": 300, "output_length": 30}', - '{"timestamp": 0.003, "input_length": 400, "output_length": 40}', - ], - [1.0], - [0.0, 0.001, 0.002, 0.003], - 1.0, - ), + (None, 1.0), + ([2.0], 2.0), ], ) - def test_resolve_args_and_create_with_trace( - self, tmp_path: Path, trace_lines, rate, expected_ts, expected_scale + def test_profile_create_resolves_timestamps_and_time_scale( + self, tmp_path: Path, rate, expected_scale ): - """resolve_args loads trace; Profile.create returns ReplayProfile with - correct time_scale.""" - trace = _trace_path(tmp_path, trace_lines) - out = ReplayProfile.resolve_args( - rate_type="replay", - rate=rate, - random_seed=42, - data=[str(trace)], + trace = _trace_path( + tmp_path, + [ + '{"timestamp": 5.0, "input_length": 1, "output_length": 1}', + '{"timestamp": 2.0, "input_length": 2, "output_length": 2}', + '{"timestamp": 8.0, "input_length": 3, "output_length": 3}', + ], ) - assert out["relative_timestamps"] == pytest.approx(expected_ts, abs=1e-9) - assert out["time_scale"] == expected_scale + profile = Profile.create( rate_type="replay", rate=rate, random_seed=42, data=[str(trace)], ) + assert isinstance(profile, ReplayProfile) - assert profile.relative_timestamps == pytest.approx(expected_ts, abs=1e-9) + assert profile.relative_timestamps == pytest.approx([0.0, 3.0, 6.0], abs=1e-9) assert profile.time_scale == expected_scale - @pytest.mark.smoke - def test_next_strategy_returns_trace_then_none(self, tmp_path: Path): - """next_strategy returns TraceReplayStrategy then None.""" + @pytest.mark.sanity + def test_non_positive_time_scale_is_rejected(self, tmp_path: Path): trace = _trace_path( tmp_path, ['{"timestamp": 0, "input_length": 1, "output_length": 1}'], ) - kwargs = ReplayProfile.resolve_args( - rate_type="replay", - rate=[1.0], - random_seed=42, - data=[str(trace)], - ) - profile = ReplayProfile(**kwargs) - assert profile.strategy_types == ["trace"] - s1 = profile.next_strategy(None, None) - assert isinstance(s1, TraceReplayStrategy) - assert s1.relative_timestamps == [0.0] - assert s1.time_scale == 1.0 - assert profile.next_strategy(s1, None) is None + + with pytest.raises(ValidationError): + Profile.create( + rate_type="replay", + rate=[0.0], + random_seed=42, + data=[str(trace)], + ) @pytest.mark.smoke - @pytest.mark.parametrize( - ("trace_lines", "data_samples", "expected_ts"), - [ - ( - [ - '{"timestamp": 0, "input_length": 1, "output_length": 1}', - '{"timestamp": 1.0, "input_length": 2, "output_length": 2}', - '{"timestamp": 2.0, "input_length": 3, "output_length": 3}', - '{"timestamp": 3.0, "input_length": 4, "output_length": 4}', - ], - 2, - [0.0, 1.0], - ), - ( - [ - '{"timestamp": 1.0, "input_length": 100, "output_length": 10}', - '{"timestamp": 1.0, "input_length": 200, "output_length": 20}', - '{"timestamp": 1.0, "input_length": 300, "output_length": 30}', - '{"timestamp": 1.0, "input_length": 400, "output_length": 40}', - '{"timestamp": 1.0, "input_length": 500, "output_length": 50}', - ], - 2, - [0.0, 0.0], - ), - ( - [ - '{"timestamp": 5.0, "input_length": 100, "output_length": 10}', - '{"timestamp": 2.0, "input_length": 200, "output_length": 20}', - '{"timestamp": 8.0, "input_length": 300, "output_length": 30}', - '{"timestamp": 1.0, "input_length": 400, "output_length": 40}', - ], - 3, - [0.0, 1.0, 4.0], - ), - ], - ) - def test_data_samples_truncates_timestamps( - self, tmp_path: Path, trace_lines, data_samples, expected_ts - ): - """data_samples truncates replay 
timestamps to match sampled dataset rows.""" - trace = _trace_path(tmp_path, trace_lines) + def test_custom_timestamp_column_via_data_args(self, tmp_path: Path): + trace = _trace_path( + tmp_path, + [ + '{"ts": 5.0, "input_length": 100, "output_length": 10}', + '{"ts": 2.0, "input_length": 200, "output_length": 20}', + '{"ts": 8.0, "input_length": 300, "output_length": 30}', + ], + ) + kwargs = ReplayProfile.resolve_args( rate_type="replay", rate=[1.0], random_seed=42, data=[str(trace)], - data_samples=data_samples, + data_args=[{"timestamp_column": "ts"}], ) - assert kwargs["relative_timestamps"] == pytest.approx(expected_ts, abs=1e-9) + + assert kwargs["relative_timestamps"] == pytest.approx([0.0, 3.0, 6.0], abs=1e-9) @pytest.mark.smoke - def test_constraints_remain_runtime_only(self, tmp_path: Path): - """Runtime constraints are preserved and do not filter replay timestamps.""" + @pytest.mark.parametrize("invalid_value", [None, "", " ", 123, False, []]) + def test_invalid_timestamp_column_config_falls_back_to_default( + self, tmp_path: Path, invalid_value + ): trace = _trace_path( tmp_path, [ - '{"timestamp": 0, "input_length": 1, "output_length": 1}', - '{"timestamp": 0.5, "input_length": 2, "output_length": 2}', - '{"timestamp": 1.0, "input_length": 3, "output_length": 3}', + '{"timestamp": 10.0, "input_length": 1, "output_length": 1}', + '{"timestamp": 12.0, "input_length": 2, "output_length": 2}', ], ) + kwargs = ReplayProfile.resolve_args( rate_type="replay", - rate=[2.0], + rate=[1.0], random_seed=42, data=[str(trace)], - constraints={"max_requests": 2, "max_seconds": 1.5}, + data_args=[{"timestamp_column": invalid_value}], ) - assert kwargs["relative_timestamps"] == pytest.approx([0.0, 0.5, 1.0], abs=1e-9) - assert kwargs["constraints"] == {"max_requests": 2, "max_seconds": 1.5} - assert kwargs["time_scale"] == 2.0 + + assert kwargs["relative_timestamps"] == pytest.approx([0.0, 2.0], abs=1e-9) @pytest.mark.smoke - def test_custom_timestamp_column_via_data_args(self, tmp_path: Path): - """data_args[0]["timestamp_column"] is honored by ReplayProfile.""" + def test_data_samples_truncates_after_sorting_and_preserves_constraints( + self, tmp_path: Path + ): trace = _trace_path( tmp_path, [ - '{"ts": 5.0, "input_length": 100, "output_length": 10}', - '{"ts": 2.0, "input_length": 200, "output_length": 20}', - '{"ts": 8.0, "input_length": 300, "output_length": 30}', + '{"timestamp": 5.0, "input_length": 1, "output_length": 1}', + '{"timestamp": 2.0, "input_length": 2, "output_length": 2}', + '{"timestamp": 8.0, "input_length": 3, "output_length": 3}', + '{"timestamp": 1.0, "input_length": 4, "output_length": 4}', ], ) + kwargs = ReplayProfile.resolve_args( rate_type="replay", rate=[1.0], random_seed=42, data=[str(trace)], - data_args=[{"timestamp_column": "ts"}], - ) - assert kwargs["relative_timestamps"] == pytest.approx( - [0.0, 3.0, 6.0], abs=1e-9 + data_samples=3, + constraints={"max_requests": 10, "max_seconds": 0.25}, ) - @pytest.mark.sanity - def test_default_timestamp_column_when_data_args_missing(self, tmp_path: Path): - """Without data_args, replay falls back to the default `timestamp` column.""" + assert kwargs["relative_timestamps"] == pytest.approx([0.0, 1.0, 4.0], abs=1e-9) + assert kwargs["constraints"] == {"max_requests": 10, "max_seconds": 0.25} + + @pytest.mark.smoke + @pytest.mark.parametrize("data_samples", [0, -1]) + def test_non_positive_data_samples_do_not_truncate( + self, tmp_path: Path, data_samples: int + ): trace = _trace_path( tmp_path, [ @@ -249,58 +175,68 
@@ def test_default_timestamp_column_when_data_args_missing(self, tmp_path: Path): '{"timestamp": 1.0, "input_length": 2, "output_length": 2}', ], ) + kwargs = ReplayProfile.resolve_args( rate_type="replay", rate=[1.0], random_seed=42, data=[str(trace)], + data_samples=data_samples, ) + assert kwargs["relative_timestamps"] == pytest.approx([0.0, 1.0], abs=1e-9) - @pytest.mark.sanity - @pytest.mark.parametrize("invalid_value", [None, "", " ", 123, 1.5, False, []]) - def test_invalid_timestamp_column_falls_back_to_default( - self, - tmp_path: Path, - invalid_value: object, - ): - """Invalid timestamp_column values fall back to default `timestamp`.""" + @pytest.mark.smoke + def test_resolve_profile_passes_replay_specific_kwargs(self, tmp_path: Path): trace = _trace_path( tmp_path, [ - '{"timestamp": 10.0, "input_length": 1, "output_length": 1}', - '{"timestamp": 12.0, "input_length": 2, "output_length": 2}', + '{"ts": 5.0, "input_length": 1, "output_length": 1}', + '{"ts": 2.0, "input_length": 2, "output_length": 2}', + '{"ts": 8.0, "input_length": 3, "output_length": 3}', ], ) - kwargs = ReplayProfile.resolve_args( - rate_type="replay", - rate=[1.0], - random_seed=42, - data=[str(trace)], - data_args=[{"timestamp_column": invalid_value}], + + profile = asyncio.run( + resolve_profile( + profile="replay", + rate=[2.0], + random_seed=42, + rampup=0.0, + constraints={}, + max_seconds=None, + max_requests=2, + max_errors=None, + max_error_rate=None, + max_global_error_rate=None, + data=[str(trace)], + data_args=[{"timestamp_column": "ts"}], + data_samples=2, + ) ) - assert kwargs["relative_timestamps"] == pytest.approx([0.0, 2.0], abs=1e-9) + + assert isinstance(profile, ReplayProfile) + assert profile.relative_timestamps == pytest.approx([0.0, 3.0], abs=1e-9) + assert profile.time_scale == 2.0 + assert profile.constraints["max_requests"] == 2 @pytest.mark.smoke - def test_data_samples_and_constraints_are_independent(self, tmp_path: Path): - """data_samples truncates timestamps without mutating runtime constraints.""" + def test_next_strategy_returns_trace_then_none(self, tmp_path: Path): trace = _trace_path( tmp_path, - [ - '{"timestamp": 0, "input_length": 1, "output_length": 1}', - '{"timestamp": 1.0, "input_length": 2, "output_length": 2}', - '{"timestamp": 2.0, "input_length": 3, "output_length": 3}', - '{"timestamp": 3.0, "input_length": 4, "output_length": 4}', - '{"timestamp": 4.0, "input_length": 5, "output_length": 5}', - ], + ['{"timestamp": 0, "input_length": 1, "output_length": 1}'], ) kwargs = ReplayProfile.resolve_args( rate_type="replay", - rate=[1.0], + rate=[2.0], random_seed=42, data=[str(trace)], - data_samples=3, - constraints={"max_requests": 10, "max_seconds": 0.25}, ) - assert kwargs["relative_timestamps"] == pytest.approx([0.0, 1.0, 2.0], abs=1e-9) - assert kwargs["constraints"] == {"max_requests": 10, "max_seconds": 0.25} + profile = ReplayProfile(**kwargs) + + strategy = profile.next_strategy(None, None) + assert profile.strategy_types == ["trace"] + assert isinstance(strategy, TraceReplayStrategy) + assert strategy.relative_timestamps == [0.0] + assert strategy.time_scale == 2.0 + assert profile.next_strategy(strategy, None) is None diff --git a/tests/unit/data/deserializers/test_trace_synthetic.py b/tests/unit/data/deserializers/test_trace_synthetic.py index f6610969d..9a98097a4 100644 --- a/tests/unit/data/deserializers/test_trace_synthetic.py +++ b/tests/unit/data/deserializers/test_trace_synthetic.py @@ -1,12 +1,3 @@ -## WRITTEN BY AI ## - -""" -Unit tests for 
TraceSyntheticDatasetDeserializer. - -Ensures trace file is loaded and synthetic prompts are generated with exact -input_length. -""" - from __future__ import annotations from pathlib import Path @@ -21,114 +12,142 @@ from guidellm.data.schemas import DataNotSupportedError -def _mock_processor(): - """Tokenizer that returns token count = number of words in text.""" +def _mock_processor() -> Mock: + """Tokenizer where each whitespace-delimited word is one token.""" proc = Mock() - proc.encode.side_effect = lambda text: list(range(max(1, len(text.split())))) + proc.encode.side_effect = lambda text: list(range(len(text.split()))) proc.decode.side_effect = lambda tokens, skip_special_tokens=False: " ".join( - "x" for _ in range(len(tokens)) + f"tok{i}" for i, _ in enumerate(tokens) ) return proc -def _deserialize(deserializer, data, **kwargs): - defaults = { - "processor_factory": _mock_processor, - "random_seed": 42, - } - return deserializer(**{**defaults, "data": data, **kwargs}) +def _write_trace(tmp_path: Path, content: str, suffix: str = ".jsonl") -> Path: + path = tmp_path / f"trace{suffix}" + path.write_text(content) + return path class TestTraceSyntheticDatasetDeserializer: - """Tests for TraceSyntheticDatasetDeserializer.""" - @pytest.fixture - def deserializer(self): + def deserializer(self) -> TraceSyntheticDatasetDeserializer: return TraceSyntheticDatasetDeserializer() + def _deserialize(self, deserializer, data, **kwargs): + return deserializer( + data=data, + processor_factory=_mock_processor, + random_seed=42, + **kwargs, + ) + @pytest.mark.smoke + def test_loads_sorted_rows_and_keeps_token_columns_aligned( + self, tmp_path: Path, deserializer + ): + trace = _write_trace( + tmp_path, + '{"timestamp": 5.0, "input_length": 3, "output_length": 30}\n' + '{"timestamp": 2.0, "input_length": 1, "output_length": 10}\n' + '{"timestamp": 2.0, "input_length": 2, "output_length": 20}\n' + '{"timestamp": 8.0, "input_length": 0, "output_length": 40}\n', + ) + + ds = self._deserialize(deserializer, trace, type_="trace_synthetic") + + assert isinstance(ds, Dataset) + assert ds["prompt_tokens_count"] == [1, 2, 3, 0] + assert ds["output_tokens_count"] == [10, 20, 30, 40] + for prompt, token_count in zip( + ds["prompt"], ds["prompt_tokens_count"], strict=True + ): + assert len(_mock_processor().encode(prompt)) == token_count + + @pytest.mark.smoke + def test_honors_custom_column_names(self, tmp_path: Path, deserializer): + trace = _write_trace( + tmp_path, + '{"ts": 3.0, "input_tokens": 4, "generated_tokens": 40}\n' + '{"ts": 1.0, "input_tokens": 2, "generated_tokens": 20}\n', + ) + + ds = self._deserialize( + deserializer, + trace, + type_="trace_synthetic", + timestamp_column="ts", + prompt_tokens_column="input_tokens", + output_tokens_column="generated_tokens", + ) + + assert ds["prompt_tokens_count"] == [2, 4] + assert ds["output_tokens_count"] == [20, 40] + + @pytest.mark.smoke + def test_rejects_invalid_data(self, deserializer): + with pytest.raises(DataNotSupportedError, match="path to a trace file"): + self._deserialize(deserializer, 123) + + @pytest.mark.sanity @pytest.mark.parametrize( - ("content", "expected"), + ("content", "kwargs", "match"), [ - # Basic small counts + ("", {}, "empty"), ( - '{"timestamp": 0, "input_length": 50, "output_length": 20}\n' - '{"timestamp": 0.5, "input_length": 100, "output_length": 30}\n', - [(50, 20), (100, 30)], + '{"ts": 0, "input_length": 10, "output_length": 5}\n', + {}, + "timestamp", ), - # Production-scale token counts (4K-128K contexts) ( - 
'{"timestamp": 0, "input_length": 4096, "output_length": 512}\n' - '{"timestamp": 1.0, "input_length": 8192, "output_length": 1024}\n' - '{"timestamp": 2.0, "input_length": 32768, "output_length": 4096}\n' - '{"timestamp": 3.0, "input_length": 131072, "output_length": 8192}\n', - [(4096, 512), (8192, 1024), (32768, 4096), (131072, 8192)], + '{"timestamp": 0, "input_length": 10}\n', + {}, + "output_length", ), - # Mixed high/low alternating (edge cases) ( - '{"timestamp": 0, "input_length": 10, "output_length": 5}\n' - '{"timestamp": 0.1, "input_length": 65536, "output_length": 16384}\n' - '{"timestamp": 0.2, "input_length": 20, "output_length": 10}\n' - '{"timestamp": 0.3, "input_length": 131072, "output_length": 32768}\n', - [(10, 5), (65536, 16384), (20, 10), (131072, 32768)], + '{"timestamp": 0, "prompt_tokens": 10, "output_length": 5}\n', + { + "prompt_tokens_column": "prompt_tokens", + "output_tokens_column": "out", + }, + "out", ), - # Unsorted timestamps with duplicates (sorts by timestamp) ( - '{"timestamp": 5.0, "input_length": 100, "output_length": 10}\n' - '{"timestamp": 2.0, "input_length": 200, "output_length": 20}\n' - '{"timestamp": 8.0, "input_length": 300, "output_length": 30}\n' - '{"timestamp": 2.0, "input_length": 400, "output_length": 40}\n', - [(200, 20), (400, 40), (100, 10), (300, 30)], + '{"timestamp": "bad", "input_length": 10, "output_length": 5}\n', + {}, + "could not convert", ), - # Concurrent burst (5 requests at same timestamp) ( - '{"timestamp": 1.0, "input_length": 100, "output_length": 10}\n' - '{"timestamp": 1.0, "input_length": 200, "output_length": 20}\n' - '{"timestamp": 1.0, "input_length": 300, "output_length": 30}\n' - '{"timestamp": 1.0, "input_length": 400, "output_length": 40}\n' - '{"timestamp": 1.0, "input_length": 500, "output_length": 50}\n', - [(100, 10), (200, 20), (300, 30), (400, 40), (500, 50)], + '{"timestamp": 0, "input_length": "bad", "output_length": 5}\n', + {}, + "invalid literal", + ), + ( + '{"timestamp": 0, "input_length": 10, "output_length": null}\n', + {}, + "NoneType", + ), + ( + '{"timestamp": 0, "input_length": 10, "output_length": 5}\nnot-json\n', + {}, + "generating the dataset", ), ], ) - def test_load_jsonl_various_scenarios( - self, tmp_path: Path, deserializer, content, expected + def test_trace_validation_raises( + self, tmp_path: Path, deserializer, content, kwargs, match ): - """Trace JSONL yields exact token counts (small, large, mixed, unsorted, - duplicates).""" - trace = tmp_path / "trace.jsonl" - trace.write_text(content) - ds = _deserialize(deserializer, str(trace), type_="trace_synthetic") - assert isinstance(ds, Dataset) - assert len(ds) == len(expected) - assert set(ds.column_names) >= { - "prompt", - "prompt_tokens_count", - "output_tokens_count", - } - for row, (in_len, out_len) in zip(ds, expected, strict=True): - assert row["prompt_tokens_count"] == in_len - assert row["output_tokens_count"] == out_len + trace = _write_trace(tmp_path, content) - @pytest.mark.smoke - def test_rejects_invalid_data(self, deserializer): - """Non-path data raises DataNotSupportedError.""" - with pytest.raises(DataNotSupportedError, match="path to a trace file"): - _deserialize(deserializer, 123) + with pytest.raises(DataNotSupportedError, match=match): + self._deserialize(deserializer, trace, **kwargs) @pytest.mark.sanity - @pytest.mark.parametrize( - ("content", "match"), - [ - ("", "empty"), - ('{"ts": 0, "input_length": 10, "output_length": 5}\n', "timestamp"), - ], - ) - def test_trace_validation_raises( - self, 
tmp_path: Path, deserializer, content, match - ): - """Empty trace or missing required column raises DataNotSupportedError.""" - trace = tmp_path / "trace.jsonl" - trace.write_text(content) - with pytest.raises(DataNotSupportedError, match=match): - _deserialize(deserializer, str(trace)) + def test_unsupported_file_suffix_raises(self, tmp_path: Path, deserializer): + trace = _write_trace( + tmp_path, + '{"timestamp": 0, "input_length": 10, "output_length": 5}\n', + suffix=".json", + ) + + with pytest.raises(DataNotSupportedError, match=r"Unsupported.*\.json"): + self._deserialize(deserializer, trace) diff --git a/tests/unit/scheduler/test_trace_replay.py b/tests/unit/scheduler/test_trace_replay.py index 37a14b614..d35c4170e 100644 --- a/tests/unit/scheduler/test_trace_replay.py +++ b/tests/unit/scheduler/test_trace_replay.py @@ -1,170 +1,118 @@ -""" -Unit tests for trace replay strategy and load_relative_timestamps. - -Verifies that TraceReplayStrategy schedules requests at start_time + time_scale -* relative_timestamp[i] and that load_relative_timestamps correctly parses trace -files. -""" - from __future__ import annotations import asyncio -import json import math from multiprocessing import get_context from pathlib import Path import pytest +from datasets.exceptions import DatasetGenerationError from guidellm.scheduler import SchedulingStrategy, TraceReplayStrategy from guidellm.schemas import RequestInfo from guidellm.utils.trace_io import load_relative_timestamps -def _write_trace(path: Path, content: str) -> Path: +def _write_trace(tmp_path: Path, content: str, suffix: str = ".jsonl") -> Path: + path = tmp_path / f"trace{suffix}" path.write_text(content) return path class TestLoadRelativeTimestamps: - """Tests for load_relative_timestamps helper.""" + @pytest.mark.smoke + def test_loads_sorted_relative_timestamps_with_duplicates(self, tmp_path: Path): + trace = _write_trace( + tmp_path, + '{"timestamp": 5.0, "input_length": 10, "output_length": 10}\n' + '{"timestamp": 2.0, "input_length": 20, "output_length": 20}\n' + '{"timestamp": 2.0, "input_length": 30, "output_length": 30}\n' + '{"timestamp": 8.0, "input_length": 40, "output_length": 40}\n', + ) + + assert load_relative_timestamps(trace) == pytest.approx( + [0.0, 0.0, 3.0, 6.0], abs=1e-9 + ) + + @pytest.mark.smoke + def test_loads_custom_timestamp_column(self, tmp_path: Path): + trace = _write_trace( + tmp_path, + '{"ts": 10.0, "input_length": 10, "output_length": 10}\n' + '{"ts": 10.25, "input_length": 20, "output_length": 20}\n', + ) + + assert load_relative_timestamps(trace, timestamp_column="ts") == pytest.approx( + [0.0, 0.25], abs=1e-9 + ) @pytest.mark.smoke @pytest.mark.parametrize( - ("content", "kwargs", "expected"), + ("suffix", "content", "error_type", "match"), [ - # Basic cases + (".jsonl", "", ValueError, "no valid rows"), ( - '{"timestamp": 100, "input_length": 10}\n' - '{"timestamp": 100.5, "input_length": 20}\n' - '{"timestamp": 101.2, "input_length": 15}\n', - {"timestamp_column": "timestamp"}, - [0.0, 0.5, 1.2], + ".json", + '[{"timestamp": 0, "input_length": 10, "output_length": 100}]', + ValueError, + r"Unsupported.*\.json", ), ( - '{"ts": 0, "input_length": 1}\n{"ts": 2.5, "input_length": 2}\n', - {"timestamp_column": "ts"}, - [0.0, 2.5], + ".csv", + "timestamp,input_length,output_length\n0,10,100\n", + ValueError, + r"Unsupported.*\.csv", ), - # High token counts (production-like: 2K-128K contexts) ( - '{"timestamp": 0, "input_length": 2048, "output_length": 512}\n' - '{"timestamp": 1.5, "input_length": 
4096, "output_length": 1024}\n' - '{"timestamp": 3.0, "input_length": 8192, "output_length": 2048}\n' - '{"timestamp": 4.5, "input_length": 32768, "output_length": 8192}\n' - '{"timestamp": 6.0, "input_length": 131072, "output_length": 32768}\n', - {"timestamp_column": "timestamp"}, - [0.0, 1.5, 3.0, 4.5, 6.0], + ".jsonl", + '{"ts": 0, "input_length": 10, "output_length": 100}\n', + KeyError, + "timestamp", ), - # Unsorted timestamps (sorted chronologically, all >= 0) ( - '{"timestamp": 5.0, "input_length": 10}\n' - '{"timestamp": 2.0, "input_length": 20}\n' - '{"timestamp": 8.0, "input_length": 30}\n', - {"timestamp_column": "timestamp"}, - [0.0, 3.0, 6.0], # Sorted: 2.0, 5.0, 8.0 -> 0.0, 3.0, 6.0 + ".jsonl", + '{"timestamp": "bad", "input_length": 10, "output_length": 100}\n', + ValueError, + "could not convert", ), - # Duplicate timestamps (concurrent burst) ( - '{"timestamp": 1.0, "input_length": 10}\n' - '{"timestamp": 1.0, "input_length": 20}\n' - '{"timestamp": 1.0, "input_length": 30}\n' - '{"timestamp": 2.5, "input_length": 40}\n', - {"timestamp_column": "timestamp"}, - [0.0, 0.0, 0.0, 1.5], + ".jsonl", + '{"timestamp": 0, "input_length": 10, "output_length": 100}\n' + "not-json\n", + DatasetGenerationError, + "generating the dataset", ), ], ) - def test_load_valid_jsonl(self, tmp_path: Path, content, kwargs, expected): - """Load JSONL trace and get sorted relative timestamps (basic, high counts, - unsorted, duplicates).""" - trace = tmp_path / "trace.jsonl" - _write_trace(trace, content) - out = load_relative_timestamps(trace, **kwargs) - assert out == pytest.approx(expected, abs=1e-9) - - @pytest.mark.smoke - def test_empty_trace_raises(self, tmp_path: Path): - """Empty trace file raises ValueError.""" - trace = tmp_path / "trace.jsonl" - _write_trace(trace, "") - with pytest.raises(ValueError, match="no valid rows"): - load_relative_timestamps(trace) + def test_invalid_trace_inputs_raise( + self, tmp_path: Path, suffix, content, error_type, match + ): + trace = _write_trace(tmp_path, content, suffix=suffix) - @pytest.mark.smoke - @pytest.mark.parametrize( - ("suffix", "content", "match"), - [ - ( - "json", - json.dumps( - [ - {"timestamp": 0, "input_length": 1}, - {"timestamp": 1.0, "input_length": 2}, - ] - ), - r"Unsupported trace file format.*\.json", - ), - ( - "csv", - "timestamp,input_length,output_length\n0,10,5\n0.3,20,10\n", - r"Unsupported trace file format.*\.csv", - ), - ("txt", "0\n1\n", "Unsupported trace file format"), - ], - ) - def test_unsupported_format_raises(self, tmp_path: Path, suffix, content, match): - """JSON array, CSV, or unknown suffix raises ValueError.""" - trace = tmp_path / f"trace.{suffix}" - _write_trace(trace, content) - with pytest.raises(ValueError, match=match): + with pytest.raises(error_type, match=match): load_relative_timestamps(trace) class TestTraceReplayStrategy: - """Tests for TraceReplayStrategy.""" - @pytest.mark.smoke - @pytest.mark.parametrize( - ("timestamps", "time_scale"), - [ - ([0.0, 0.5, 1.0], 2.0), - ([0.0, 1.0], 0.5), - ], - ) - def test_initialization_and_str(self, timestamps, time_scale): - """Init, type_, optional str, and limits.""" + def test_initialization_and_serialization(self): strategy = TraceReplayStrategy( - relative_timestamps=timestamps, - time_scale=time_scale, + relative_timestamps=[0.0, 0.5, 1.0], + time_scale=2.0, ) + assert strategy.type_ == "trace" - assert strategy.relative_timestamps == timestamps - assert strategy.time_scale == time_scale + assert str(strategy) == "trace@2.00" assert 
strategy.processes_limit is None - assert strategy.requests_limit == len(timestamps) - if time_scale == 0.5: - assert str(strategy) == "trace@0.50" + assert strategy.requests_limit == 3 - @pytest.mark.smoke - def test_marshalling(self): - """Pydantic dump/load and polymorphic restore.""" - strategy = TraceReplayStrategy( - relative_timestamps=[0.0, 1.0, 2.0], - time_scale=1.5, - ) - data = strategy.model_dump() - assert data["type_"] == "trace" - assert data["relative_timestamps"] == [0.0, 1.0, 2.0] - assert data["time_scale"] == 1.5 - reconstructed = TraceReplayStrategy.model_validate(data) - assert reconstructed.relative_timestamps == strategy.relative_timestamps - base = SchedulingStrategy.model_validate(data) - assert isinstance(base, TraceReplayStrategy) + restored = SchedulingStrategy.model_validate(strategy.model_dump()) + assert isinstance(restored, TraceReplayStrategy) + assert restored.relative_timestamps == [0.0, 0.5, 1.0] + assert restored.time_scale == 2.0 @pytest.mark.smoke - def test_next_request_time_scaled_timestamps(self): - """next_request_time returns start_time + time_scale * relative_ts[i].""" + def test_next_request_time_scales_timestamps_and_exhausts_trace(self): strategy = TraceReplayStrategy( relative_timestamps=[0.0, 0.5, 1.0], time_scale=2.0, @@ -175,45 +123,33 @@ def test_next_request_time_scaled_timestamps(self): mp_context=get_context(), ) strategy.init_processes_start(1000.0) - expected = [1000.0, 1001.0, 1002.0] async def run(): - for exp in expected: - t = await strategy.next_request_time(0) - assert t == pytest.approx(exp, abs=1e-6) + return [await strategy.next_request_time(0) for _ in range(4)] - asyncio.run(run()) + assert asyncio.run(run()) == pytest.approx( + [1000.0, 1001.0, 1002.0, math.inf], abs=1e-6 + ) @pytest.mark.smoke - def test_next_request_time_beyond_trace_parks_worker(self): - """When index > len(relative_timestamps), return math.inf to park the slot. - - Returning math.inf causes the worker to sleep indefinitely until - constraint_reached_event cancels it, preventing it from racing the - messaging queue with a stale target timestamp. 
- """ - strategy = TraceReplayStrategy( - relative_timestamps=[0.0, 1.0], - time_scale=1.0, - ) + def test_empty_trace_has_no_request_limit_and_uses_start_time(self): + strategy = TraceReplayStrategy(relative_timestamps=[], time_scale=1.0) strategy.init_processes_timings( worker_count=1, max_concurrency=10, mp_context=get_context(), ) - strategy.init_processes_start(500.0) + strategy.init_processes_start(123.0) + + assert strategy.requests_limit is None async def run(): - await strategy.next_request_time(0) - await strategy.next_request_time(0) - t3 = await strategy.next_request_time(0) - assert t3 == math.inf + return await strategy.next_request_time(0) - asyncio.run(run()) + assert asyncio.run(run()) == pytest.approx(123.0) @pytest.mark.smoke def test_request_completed_no_op(self): - """request_completed is a no-op.""" strategy = TraceReplayStrategy(relative_timestamps=[0.0], time_scale=1.0) info = RequestInfo( request_id="x", @@ -222,39 +158,3 @@ def test_request_completed_no_op(self): scheduler_start_time=0, ) strategy.request_completed(info) - - @pytest.mark.sanity - @pytest.mark.parametrize( - ("timestamps", "expected"), - [ - # Concurrent burst: 3 requests at same time - ([0.0, 0.0, 0.0, 1.0, 2.0], [1000.0, 1000.0, 1000.0, 1001.0, 1002.0]), - # Unsorted timestamps (sorted by load_relative_timestamps, all >= 0) - ([0.0, 3.0, 5.0, 6.0], [1000.0, 1003.0, 1005.0, 1006.0]), - # High frequency burst (millisecond scale) - ( - [0.0, 0.001, 0.002, 0.003, 0.004], - [1000.0, 1000.001, 1000.002, 1000.003, 1000.004], - ), - ], - ) - def test_scheduling_patterns(self, timestamps, expected): - """Test concurrent bursts, unsorted timestamps (now sorted), and high-frequency - patterns.""" - strategy = TraceReplayStrategy( - relative_timestamps=timestamps, - time_scale=1.0, - ) - strategy.init_processes_timings( - worker_count=3, - max_concurrency=10, - mp_context=get_context(), - ) - strategy.init_processes_start(1000.0) - - async def run(): - for exp in expected: - t = await strategy.next_request_time(0) - assert t == pytest.approx(exp, abs=1e-6) - - asyncio.run(run()) From 09dcb32fa0d098eebfce2677b07c44481388786d Mon Sep 17 00:00:00 2001 From: Vincent Gimenes Date: Tue, 28 Apr 2026 19:53:16 +0200 Subject: [PATCH 21/27] fix ci: fix mdformat pre-commit on datasets guide Signed-off-by: Vincent Gimenes --- docs/guides/datasets.md | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/docs/guides/datasets.md b/docs/guides/datasets.md index a181552b8..8558c10fb 100644 --- a/docs/guides/datasets.md +++ b/docs/guides/datasets.md @@ -120,29 +120,38 @@ GuideLLM supports various file formats for datasets, including text, CSV, JSON, #### Supported Formats with Examples - **Text files (`.txt`, `.text`)**: Where each line is a separate prompt to use. + ``` Hello, how are you? What is your name? ``` + - **CSV files (`.csv`)**: Where each row is a separate dataset entry and the first row contains the column names. The columns should include `prompt` or other common names for the prompt which will be used as the prompt column. Additional columns can be included based on the previously mentioned aliases for the `--data-column-mapper` argument. + ```csv prompt,output_tokens_count,additional_column,additional_column2 Hello, how are you?,5,foo,bar What is your name?,3,baz,qux ``` + - **JSON Lines files (`.jsonl`)**: Where each line is a separate JSON object. The objects should include `prompt` or other common names for the prompt which will be used as the prompt column. 
Additional fields can be included based on the previously mentioned aliases for the `--data-args` argument. + ```json {"prompt": "Hello, how are you?", "output_tokens_count": 5, "additional_column": "foo", "additional_column2": "bar"} {"prompt": "What is your name?", "output_tokens_count": 3, "additional_column": "baz", "additional_column2": "qux"} ``` + - **Trace files (`.jsonl` with `trace_synthetic` type)**: Specialized JSONL files for replay benchmarking with `timestamp`, `input_length`, and `output_length` fields. Used with `--profile replay` to replay trace events using each row's timestamp and token lengths. See [Trace Replay Benchmarking](../getting-started/benchmark.md#trace-replay-benchmarking). + ```json {"timestamp": 0, "input_length": 256, "output_length": 128} {"timestamp": 0.5, "input_length": 512, "output_length": 64} ``` + Trace rows are ordered by timestamp before GuideLLM schedules requests and generates synthetic payloads. This keeps each scheduled event aligned with the prompt and output token lengths from the same row. Use `--data-args '{"type_": "trace_synthetic"}'` to enable trace loading. If your trace uses different column names, configure them with `timestamp_column`, `prompt_tokens_column`, and `output_tokens_column`: + ```bash guidellm benchmark \ --target "http://localhost:8000" \ @@ -151,8 +160,11 @@ GuideLLM supports various file formats for datasets, including text, CSV, JSON, --data "path/to/trace.jsonl" \ --data-args '{"type_": "trace_synthetic", "timestamp_column": "ts", "prompt_tokens_column": "input_tokens", "output_tokens_column": "output_tokens"}' ``` + For replay, `--rate` is a time scale rather than requests per second. Use `--data-samples` to limit how many trace rows are loaded and replayed. Use `--max-requests` only as a runtime completion constraint; it does not limit the trace rows loaded from the file. + - **JSON files (`.json`)**: Where the entire dataset is represented as a JSON array of objects nested under a specific key. To surface the correct key to use, a `--data-column-mapper` argument must be passed in of `"field": "NAME"` for where the array exists. The objects should include `prompt` or other common names for the prompt which will be used as the prompt column. Additional fields can be included based on the previously mentioned aliases for the `--data-column-mapper` argument. + ```json { "version": "1.0", @@ -162,8 +174,11 @@ GuideLLM supports various file formats for datasets, including text, CSV, JSON, ] } ``` + - **Parquet files (`.parquet`)** Example: A binary columnar storage format for efficient data processing. For more information on the supported formats, see the Hugging Face dataset documentation linked in the [Notes](#notes) section. + - **Arrow files (`.arrow`)** Example: A cross-language development platform for in-memory data. For more information on the supported formats, see the Hugging Face dataset documentation linked in the [Notes](#notes) section. + - **HDF5 files (`.hdf5`)** Example: A hierarchical data format for storing large amounts of data. For more information on the supported formats, see the Hugging Face dataset documentation linked in the [Notes](#notes) section. 
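To make the `--data-samples` versus `--max-requests` distinction in the trace bullet above concrete, a small sketch of the load-time truncation, mirroring the `relative_timestamps[:data_samples]` slice in `ReplayProfile.resolve_args`; the variable names are illustrative:

```python
# Illustrative only: --data-samples slices the sorted replay offsets at load
# time, while --max-requests remains a runtime stop condition and never slices.
offsets = [0.0, 1.0, 4.0, 6.0]  # sorted relative timestamps from the trace
data_samples = 3                # --data-samples 3
if isinstance(data_samples, int) and data_samples > 0:
    offsets = offsets[:data_samples]
assert offsets == [0.0, 1.0, 4.0]
```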
#### Example Commands From 6d7eac52d2e10dc8c9eefe51517c531bee5df4c7 Mon Sep 17 00:00:00 2001 From: Vincent Gimenes Date: Wed, 29 Apr 2026 12:01:27 +0200 Subject: [PATCH 22/27] docs: clarify trace replay timestamp semantics Signed-off-by: Vincent Gimenes --- docs/getting-started/benchmark.md | 16 +++++++++------- docs/guides/datasets.md | 16 +++++++++------- 2 files changed, 18 insertions(+), 14 deletions(-) diff --git a/docs/getting-started/benchmark.md b/docs/getting-started/benchmark.md index dd3c29d15..4f31f53b6 100644 --- a/docs/getting-started/benchmark.md +++ b/docs/getting-started/benchmark.md @@ -190,29 +190,31 @@ You can customize synthetic data generation with additional parameters such as s ### Trace Replay Benchmarking (beta) -For realistic load testing, replay trace events using each row's timestamp and token lengths. Trace files must be JSONL and are loaded with the `trace_synthetic` data type. By default, each row uses `timestamp`, `input_length`, and `output_length` fields: +For realistic load testing, replay trace events using each row's timestamp and token lengths. Trace files must be JSONL and are loaded with the `trace_synthetic` data type. By default, each row uses `timestamp`, `input_length`, and `output_length` fields. Timestamps may be absolute or monotonic values; GuideLLM sorts them and converts them to offsets from the first event before scheduling: ```json -{"timestamp": 0, "input_length": 256, "output_length": 128} -{"timestamp": 0.5, "input_length": 512, "output_length": 64} +{"timestamp": 1234500.0, "input_length": 256, "output_length": 128} +{"timestamp": 1234500.5, "input_length": 512, "output_length": 64} ``` -Run with the `replay` profile. This example also maps custom trace column names: +In this example, the second request is scheduled 0.5 seconds after the first request. + +Run with the `replay` profile: ```bash guidellm benchmark \ --target "http://localhost:8000" \ --data "path/to/trace.jsonl" \ - --data-args '{"type_": "trace_synthetic", "timestamp_column": "ts", "prompt_tokens_column": "input_tokens", "output_tokens_column": "output_tokens"}' \ + --data-args '{"type_": "trace_synthetic"}' \ --profile replay \ --rate 1.0 ``` -The `--rate` parameter acts as a time scale, not requests per second: `1.0` for original speed, `2.0` to multiply timestamps by 2 and run twice as long, `0.5` to multiply timestamps by 0.5 and run twice as fast. +The `--rate` parameter acts as a time scale for the intervals between trace events, not requests per second: `1.0` preserves the original timing, `2.0` doubles the intervals and runs twice as long, and `0.5` halves the intervals and runs twice as fast. GuideLLM orders trace rows by timestamp before scheduling and payload generation, so each scheduled event uses the token lengths from the same sorted row. Use `--data-samples` to limit how many trace rows are loaded and replayed. `--max-requests` remains a runtime completion constraint; it does not truncate the trace dataset. -If your trace uses the default column names shown above, omit `timestamp_column`, `prompt_tokens_column`, and `output_tokens_column` from `--data-args`. +If your trace uses different column names, map them with `timestamp_column`, `prompt_tokens_column`, and `output_tokens_column` in `--data-args`. 
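For the column mapping just described, a hedged sketch of the equivalent programmatic call, mirroring the `TraceSyntheticDatasetDeserializer` unit tests; the tokenizer factory and trace path are placeholders, not values from the patch:

```python
from functools import partial

from transformers import AutoTokenizer

from guidellm.data.deserializers.trace_synthetic import (
    TraceSyntheticDatasetDeserializer,
)

# Placeholder factory: any zero-argument callable returning a tokenizer works,
# matching the processor_factory contract exercised by the unit tests.
processor_factory = partial(AutoTokenizer.from_pretrained, "gpt2")

dataset = TraceSyntheticDatasetDeserializer()(
    data="path/to/trace.jsonl",  # rows like {"ts": ..., "input_tokens": ..., "output_tokens": ...}
    processor_factory=processor_factory,
    random_seed=42,
    timestamp_column="ts",
    prompt_tokens_column="input_tokens",
    output_tokens_column="output_tokens",
)
# Rows come back sorted by "ts", with prompt_tokens_count and
# output_tokens_count aligned to the same sorted order the scheduler replays.
```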
### Working with Real Data diff --git a/docs/guides/datasets.md b/docs/guides/datasets.md index 8558c10fb..c41458e54 100644 --- a/docs/guides/datasets.md +++ b/docs/guides/datasets.md @@ -141,16 +141,16 @@ GuideLLM supports various file formats for datasets, including text, CSV, JSON, {"prompt": "What is your name?", "output_tokens_count": 3, "additional_column": "baz", "additional_column2": "qux"} ``` -- **Trace files (`.jsonl` with `trace_synthetic` type)**: Specialized JSONL files for replay benchmarking with `timestamp`, `input_length`, and `output_length` fields. Used with `--profile replay` to replay trace events using each row's timestamp and token lengths. See [Trace Replay Benchmarking](../getting-started/benchmark.md#trace-replay-benchmarking). +- **Trace files (`.jsonl` with `trace_synthetic` type)**: Specialized JSONL files for replay benchmarking with `timestamp`, `input_length`, and `output_length` fields. Used with `--profile replay` to replay trace events using each row's timestamp and token lengths. Timestamps may be absolute or monotonic values; GuideLLM sorts them and converts them to offsets from the first event before scheduling. See [Trace Replay Benchmarking](../getting-started/benchmark.md#trace-replay-benchmarking). ```json - {"timestamp": 0, "input_length": 256, "output_length": 128} - {"timestamp": 0.5, "input_length": 512, "output_length": 64} + {"timestamp": 1234500.0, "input_length": 256, "output_length": 128} + {"timestamp": 1234500.5, "input_length": 512, "output_length": 64} ``` - Trace rows are ordered by timestamp before GuideLLM schedules requests and generates synthetic payloads. This keeps each scheduled event aligned with the prompt and output token lengths from the same row. + In this example, the second request is scheduled 0.5 seconds after the first request. Trace rows are ordered by timestamp before GuideLLM schedules requests and generates synthetic payloads. This keeps each scheduled event aligned with the prompt and output token lengths from the same row. - Use `--data-args '{"type_": "trace_synthetic"}'` to enable trace loading. If your trace uses different column names, configure them with `timestamp_column`, `prompt_tokens_column`, and `output_tokens_column`: + Use `--data-args '{"type_": "trace_synthetic"}'` to enable trace loading: ```bash guidellm benchmark \ @@ -158,10 +158,12 @@ GuideLLM supports various file formats for datasets, including text, CSV, JSON, --profile replay \ --rate 1.0 \ --data "path/to/trace.jsonl" \ - --data-args '{"type_": "trace_synthetic", "timestamp_column": "ts", "prompt_tokens_column": "input_tokens", "output_tokens_column": "output_tokens"}' + --data-args '{"type_": "trace_synthetic"}' ``` - For replay, `--rate` is a time scale rather than requests per second. Use `--data-samples` to limit how many trace rows are loaded and replayed. Use `--max-requests` only as a runtime completion constraint; it does not limit the trace rows loaded from the file. + If your trace uses different column names, configure them with `timestamp_column`, `prompt_tokens_column`, and `output_tokens_column`. + + For replay, `--rate` is a time scale for the intervals between trace events rather than requests per second. Use `--data-samples` to limit how many trace rows are loaded and replayed. Use `--max-requests` only as a runtime completion constraint; it does not limit the trace rows loaded from the file. - **JSON files (`.json`)**: Where the entire dataset is represented as a JSON array of objects nested under a specific key. 
To surface the correct key, pass a `--data-column-mapper` argument of `"field": "NAME"`, where `NAME` is the key under which the array exists. The objects should include `prompt` or other common names for the prompt, which will be used as the prompt column. Additional fields can be included based on the previously mentioned aliases for the `--data-column-mapper` argument. From b6c56f3705d71d1d7c04005a30aabba6b7ad0c02 Mon Sep 17 00:00:00 2001 From: Vincent Gimenes Date: Thu, 30 Apr 2026 16:06:49 +0200 Subject: [PATCH 23/27] docs: clarify trace replay dataset examples and explanations Signed-off-by: Vincent Gimenes --- docs/getting-started/benchmark.md | 4 ++-- docs/guides/datasets.md | 21 +++++++++++++------ 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/docs/getting-started/benchmark.md b/docs/getting-started/benchmark.md index 4f31f53b6..ce1a1623b 100644 --- a/docs/getting-started/benchmark.md +++ b/docs/getting-started/benchmark.md @@ -204,8 +204,8 @@ Run with the `replay` profile: ```bash guidellm benchmark \ --target "http://localhost:8000" \ - --data "path/to/trace.jsonl" \ - --data-args '{"type_": "trace_synthetic"}' \ + --data path/to/trace.jsonl \ + --data-args type_=trace_synthetic \ --profile replay \ --rate 1.0 ``` diff --git a/docs/guides/datasets.md b/docs/guides/datasets.md index c41458e54..2b4ce51e1 100644 --- a/docs/guides/datasets.md +++ b/docs/guides/datasets.md @@ -141,7 +141,7 @@ GuideLLM supports various file formats for datasets, including text, CSV, JSON, {"prompt": "What is your name?", "output_tokens_count": 3, "additional_column": "baz", "additional_column2": "qux"} ``` -- **Trace files (`.jsonl` with `trace_synthetic` type)**: Specialized JSONL files for replay benchmarking with `timestamp`, `input_length`, and `output_length` fields. Used with `--profile replay` to replay trace events using each row's timestamp and token lengths. Timestamps may be absolute or monotonic values; GuideLLM sorts them and converts them to offsets from the first event before scheduling. See [Trace Replay Benchmarking](../getting-started/benchmark.md#trace-replay-benchmarking). +- **Trace files (`.jsonl` with `trace_synthetic` type)**: Specialized JSONL files for replay benchmarking with `timestamp`, `input_length`, and `output_length` fields. Used with `--profile replay` to replay trace events using each row's timestamp and token lengths. Timestamps must be numbers expressed in seconds on a shared timeline with any consistent zero point; GuideLLM sorts them and converts them to offsets from the first event before scheduling. Date strings are not parsed yet, so provide timestamps as numbers. See [Trace Replay Benchmarking](../getting-started/benchmark.md#trace-replay-benchmarking). ```json {"timestamp": 1234500.0, "input_length": 256, "output_length": 128} @@ -150,18 +150,27 @@ GuideLLM supports various file formats for datasets, including text, CSV, JSON, In this example, the second request is scheduled 0.5 seconds after the first request. Trace rows are ordered by timestamp before GuideLLM schedules requests and generates synthetic payloads. This keeps each scheduled event aligned with the prompt and output token lengths from the same row.
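The alignment guarantee in the paragraph above can be sketched in a few lines: sorting whole rows once lets both the schedule and the synthetic payload sizes come from the same sorted sequence. This is an illustration, not library code; `trace.jsonl` is a placeholder path and the column names are the documented defaults:

```python
import json

with open("trace.jsonl") as trace_file:
    rows = [json.loads(line) for line in trace_file if line.strip()]
rows.sort(key=lambda row: row["timestamp"])  # order whole rows once, up front

first = rows[0]["timestamp"]
schedule = [row["timestamp"] - first for row in rows]
token_lengths = [(row["input_length"], row["output_length"]) for row in rows]

# Event i fires at schedule[i] with token_lengths[i], so every request keeps
# the token counts of the exact row that produced its timestamp.
```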
- Use `--data-args '{"type_": "trace_synthetic"}'` to enable trace loading: + Use `--data-args type_=trace_synthetic` to enable trace loading: ```bash guidellm benchmark \ - --target "http://localhost:8000" \ + --target http://localhost:8000 \ --profile replay \ --rate 1.0 \ - --data "path/to/trace.jsonl" \ - --data-args '{"type_": "trace_synthetic"}' + --data path/to/trace.jsonl \ + --data-args type_=trace_synthetic ``` - If your trace uses different column names, configure them with `timestamp_column`, `prompt_tokens_column`, and `output_tokens_column`. + If your trace uses different column names, configure them with `timestamp_column`, `prompt_tokens_column`, and `output_tokens_column`, each set to the matching column name in your trace: + + ```bash + guidellm benchmark \ + --target http://localhost:8000 \ + --profile replay \ + --rate 1.0 \ + --data replay.jsonl \ + --data-args type_=trace_synthetic,timestamp_column=ts,prompt_tokens_column=input_tokens,output_tokens_column=output_tokens + ``` For replay, `--rate` is a time scale for the intervals between trace events rather than requests per second. Use `--data-samples` to limit how many trace rows are loaded and replayed. Use `--max-requests` only as a runtime completion constraint; it does not limit the trace rows loaded from the file. From cfeecc5f575e3eff201bc3341e9a344d22122cf4 Mon Sep 17 00:00:00 2001 From: Vincent Gimenes Date: Sat, 2 May 2026 10:46:48 +0200 Subject: [PATCH 24/27] docs: clarify trace io and profiles wording Signed-off-by: Vincent Gimenes --- src/guidellm/benchmark/profiles.py | 7 +++---- src/guidellm/utils/trace_io.py | 13 ++++++------- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/src/guidellm/benchmark/profiles.py b/src/guidellm/benchmark/profiles.py index ab3d5ceed..951c85758 100644 --- a/src/guidellm/benchmark/profiles.py +++ b/src/guidellm/benchmark/profiles.py @@ -341,9 +341,8 @@ class ReplayProfile(Profile): For this profile, the ``rate`` argument is interpreted as time_scale (scale factor applied to relative timestamps), not as requests per second. - The trace file is read twice: once by the data pipeline for request payloads, and - once here for relative timestamps. When ``data_samples`` is set, the replayed - timestamps are truncated to match the sampled dataset size. + When ``data_samples`` is set, the replayed timestamps are truncated to match + the sampled dataset size. """ type_: Literal["replay"] = "replay" # type: ignore[assignment] @@ -377,7 +376,7 @@ def resolve_args( time_scale = rate[0] if rate and len(rate) > 0 else 1.0 # Honor a custom timestamp column when configured via --data-args so the - # scheduler and the trace_synthetic deserializer use the same field. + # replay profile and trace_synthetic deserializer use the same field. data_args = kwargs.get("data_args") or [] first_args = data_args[0] if data_args else {} timestamp_column = "timestamp" diff --git a/src/guidellm/utils/trace_io.py b/src/guidellm/utils/trace_io.py index 76c4362ca..a3f1962a9 100644 --- a/src/guidellm/utils/trace_io.py +++ b/src/guidellm/utils/trace_io.py @@ -1,9 +1,8 @@ """ Shared trace file I/O for replay benchmarks. -Reads trace files (.jsonl only for now) and exposes raw rows or relative timestamps. -Used by the scheduler (load_relative_timestamps) and the trace_synthetic deserializer -(load_trace_rows with token columns). +Reads trace files (.jsonl only for now) and exposes rows or relative timestamps. +Used by replay profiles and the trace_synthetic deserializer.
""" from __future__ import annotations @@ -28,11 +27,12 @@ def load_trace_rows( Supports .jsonl only (one JSON object per line). If required_columns is set, every column must exist in the dataset; otherwise KeyError is raised with a descriptive message. + If timestamp_column is set, rows are sorted by that column. :param path: Path to the trace file. :param required_columns: Optional list of column/field names that each row must have. - :param timestamp_column: Optional timestamp column used to order trace rows. + :param timestamp_column: Optional timestamp column used to sort trace rows. :param data_kwargs: Additional keyword arguments forwarded to load_dataset. :return: HuggingFace Dataset (iterable as dicts, column-accessible). :raises KeyError: If a required column is missing in the dataset. @@ -71,13 +71,12 @@ def load_relative_timestamps( """ Load timestamps from a trace file and return times relative to the first event. - Trace file must be JSONL (one JSON object per line). Timestamps are sorted - chronologically before calculating relative times. The earliest timestamp + Trace file must be JSONL (one JSON object per line). The first timestamp becomes 0.0, and all others are relative to it (always >= 0). :param path: Path to the trace file. :param timestamp_column: Name of the column/field containing the timestamp. - :return: List of relative timestamps in seconds (first is 0.0, always sorted). + :return: List of relative timestamps in seconds (first is 0.0). :raises ValueError: If the trace file is empty or has no valid rows. """ trace_dataset = load_trace_rows(path, timestamp_column=timestamp_column) From 0a1c7eb5b1d22b720b36b53ecb075db1df560bef Mon Sep 17 00:00:00 2001 From: Vincent Gimenes Date: Tue, 5 May 2026 09:44:42 +0200 Subject: [PATCH 25/27] fix replay trace scheduling completion and optimize synthetic prompt generation Signed-off-by: Vincent Gimenes --- src/guidellm/benchmark/profiles.py | 9 +- .../data/deserializers/trace_synthetic.py | 95 +++++++++++--- src/guidellm/scheduler/strategies.py | 17 +-- tests/unit/benchmark/test_replay_profile.py | 91 +++++++++++++ .../deserializers/test_trace_synthetic.py | 121 ++++++++++++++++++ tests/unit/scheduler/test_trace_replay.py | 39 +++++- 6 files changed, 336 insertions(+), 36 deletions(-) diff --git a/src/guidellm/benchmark/profiles.py b/src/guidellm/benchmark/profiles.py index 951c85758..c4a76933e 100644 --- a/src/guidellm/benchmark/profiles.py +++ b/src/guidellm/benchmark/profiles.py @@ -398,10 +398,17 @@ def resolve_args( "The trace is empty or all events were filtered out." 
) + constraints = dict(kwargs.get("constraints") or {}) + if not any( + key in constraints + for key in ("max_number", "max_num", "max_requests", "max_req") + ): + constraints["max_requests"] = len(relative_timestamps) + return { "relative_timestamps": relative_timestamps, "time_scale": time_scale, - "constraints": kwargs.get("constraints"), + "constraints": constraints, } @property diff --git a/src/guidellm/data/deserializers/trace_synthetic.py b/src/guidellm/data/deserializers/trace_synthetic.py index 43146c1b0..b732ff5b3 100644 --- a/src/guidellm/data/deserializers/trace_synthetic.py +++ b/src/guidellm/data/deserializers/trace_synthetic.py @@ -26,32 +26,76 @@ __all__ = ["TraceSyntheticDatasetDeserializer"] -def _create_prompt( +def _encode_prompt( processor: PreTrainedTokenizerBase, - prompt_tokens_count: int, - faker: Faker, - unique: str = "", + text: str, +) -> list[int]: + """Encode text with the configured tokenizer defaults.""" + return processor.encode(text) + + +def _decode_prompt( + processor: PreTrainedTokenizerBase, + token_ids: list[int], ) -> str: - """Generate text that tokenizes to exactly prompt_tokens_count tokens.""" - prompt_token_ids: list[int] = [] - avg_chars_per_token = 5 - margin_of_safety = 1.5 + """Decode token ids into a prompt string.""" + decoded = processor.decode(token_ids, skip_special_tokens=True) + if isinstance(decoded, list): + return decoded[0] if decoded else "" + return decoded + + +def _create_base_prompt_token_ids( + processor: PreTrainedTokenizerBase, + faker: Faker, + token_count: int, +) -> list[int]: + """Generate reusable synthetic token ids for trace prompt construction.""" + if token_count <= 0: + return [] + + token_text = (faker.word() or "x")[0] + text = token_text + token_ids = _encode_prompt(processor, text) + max_attempts = 8 attempts = 0 - while len(prompt_token_ids) < prompt_tokens_count: + while len(token_ids) < token_count and attempts < max_attempts: attempts += 1 - num_chars = int( - prompt_tokens_count * avg_chars_per_token * margin_of_safety * attempts + missing_tokens = token_count - len(token_ids) + text = f"{text} {' '.join([token_text] * missing_tokens)}" + token_ids = _encode_prompt(processor, text) + + if len(token_ids) < token_count: + raise DataNotSupportedError( + "Could not generate enough synthetic prompt tokens for " + f"{token_count} tokens after {max_attempts} attempts" ) - text = unique + faker.text(max_nb_chars=num_chars) - prompt_token_ids = processor.encode(text) - decoded = processor.decode( - prompt_token_ids[:prompt_tokens_count], skip_special_tokens=True - ) - if isinstance(decoded, list): - return decoded[0] if decoded else "" - return decoded + return token_ids + + +def _create_prompt( + processor: PreTrainedTokenizerBase, + prompt_tokens_count: int, + base_prompt_token_ids: list[int], + request_index: int, +) -> str: + """Build a prompt from unique prefix tokens and reusable base prompt tokens.""" + if prompt_tokens_count <= 0: + return "" + + unique_prefix = f"guidellm-trace-request-{request_index}: " + prefix_token_ids = _encode_prompt(processor, unique_prefix) + prompt_token_ids = (prefix_token_ids + base_prompt_token_ids)[:prompt_tokens_count] + if len(prompt_token_ids) < prompt_tokens_count: + raise DataNotSupportedError( + "Could not build a synthetic prompt with " + f"{prompt_tokens_count} tokens from generated base tokens" + ) + + prompt = _decode_prompt(processor, prompt_token_ids) + return prompt def _load_trace_rows( @@ -128,6 +172,10 @@ def __call__( processor = processor_factory() faker = 
Faker() faker.seed_instance(random_seed) + max_prompt_tokens = max(row["prompt_tokens"] for row in rows) + base_prompt_token_ids = _create_base_prompt_token_ids( + processor, faker, max_prompt_tokens + ) prompts: list[str] = [] prompt_tokens_counts: list[int] = [] @@ -135,7 +183,14 @@ def __call__( for i, row in enumerate(rows): n_in = row["prompt_tokens"] n_out = row["output_tokens"] - prompt = _create_prompt(processor, n_in, faker, unique=f"{i} ") + if n_in < 0 or n_out < 0: + raise DataNotSupportedError( + "Trace token counts must be non-negative, got " + f"input_length={n_in}, output_length={n_out}" + ) + prompt = _create_prompt( + processor, n_in, base_prompt_token_ids, request_index=i + ) prompts.append(prompt) prompt_tokens_counts.append(n_in) output_tokens_counts.append(n_out) diff --git a/src/guidellm/scheduler/strategies.py b/src/guidellm/scheduler/strategies.py index a38e61974..33058000f 100644 --- a/src/guidellm/scheduler/strategies.py +++ b/src/guidellm/scheduler/strategies.py @@ -698,13 +698,13 @@ def __str__(self) -> str: @property def processes_limit(self) -> PositiveInt | None: - return None + # TODO: Support multi-process trace replay once each scheduled timestamp + # is bound to its request before workers compete for queue items. + return 1 @property def requests_limit(self) -> PositiveInt | None: - if not self.relative_timestamps: - return None - return len(self.relative_timestamps) + return None async def next_request_time(self, worker_index: NonNegativeInt) -> float: _ = worker_index @@ -714,10 +714,11 @@ async def next_request_time(self, worker_index: NonNegativeInt) -> float: idx = self.next_request_index() if idx > len(self.relative_timestamps): - # Trace exhausted: signal the worker to wait for constraint_reached_event. - # math.inf tells the worker the trace is done; it will wait for the - # constraint to be reached instead of scheduling more requests. - return math.inf + # Trace exhausted: park this worker slot until the scheduler cancels + # the processing loop via constraint_reached_event. CancelledError + # propagates up cleanly, matching the exit path of all other strategies. 
+ await asyncio.Event().wait() + return start_time + self.time_scale * self.relative_timestamps[idx - 1] def request_completed(self, request_info: RequestInfo): diff --git a/tests/unit/benchmark/test_replay_profile.py b/tests/unit/benchmark/test_replay_profile.py index b1ce75f2c..215fa302e 100644 --- a/tests/unit/benchmark/test_replay_profile.py +++ b/tests/unit/benchmark/test_replay_profile.py @@ -77,6 +77,7 @@ def test_profile_create_resolves_timestamps_and_time_scale( assert isinstance(profile, ReplayProfile) assert profile.relative_timestamps == pytest.approx([0.0, 3.0, 6.0], abs=1e-9) assert profile.time_scale == expected_scale + assert profile.constraints["max_requests"] == 3 @pytest.mark.sanity def test_non_positive_time_scale_is_rejected(self, tmp_path: Path): @@ -113,6 +114,94 @@ def test_custom_timestamp_column_via_data_args(self, tmp_path: Path): ) assert kwargs["relative_timestamps"] == pytest.approx([0.0, 3.0, 6.0], abs=1e-9) + assert kwargs["constraints"]["max_requests"] == 3 + + @pytest.mark.smoke + def test_large_bursty_trace_sets_default_request_constraint( + self, tmp_path: Path + ): + prompt_lengths = [ + 6755, + 7319, + 7234, + 2287, + 9013, + 6506, + 4824, + 3119, + 23090, + 3135, + 26874, + 10487, + 17448, + 6253, + 6725, + 13538, + 87162, + 6166, + 6320, + 2007, + 3174, + 3131, + 3159, + 6820, + 3154, + 9416, + 7460, + ] + timestamps = [ + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.5, + 0.5, + 0.5, + 0.5, + 0.5, + 0.5, + 0.5, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 2.0, + 2.0, + 2.0, + 2.0, + ] + trace = _trace_path( + tmp_path, + [ + ( + f'{{"timestamp": {timestamp}, ' + f'"input_length": {prompt_length}, "output_length": 1}}' + ) + for timestamp, prompt_length in zip( + timestamps, prompt_lengths, strict=True + ) + ], + ) + + kwargs = ReplayProfile.resolve_args( + rate_type="replay", + rate=[1.0], + random_seed=42, + data=[str(trace)], + ) + + assert kwargs["relative_timestamps"] == pytest.approx( + timestamps, abs=1e-9 + ) + assert kwargs["constraints"]["max_requests"] == 27 @pytest.mark.smoke @pytest.mark.parametrize("invalid_value", [None, "", " ", 123, False, []]) @@ -136,6 +225,7 @@ def test_invalid_timestamp_column_config_falls_back_to_default( ) assert kwargs["relative_timestamps"] == pytest.approx([0.0, 2.0], abs=1e-9) + assert kwargs["constraints"]["max_requests"] == 2 @pytest.mark.smoke def test_data_samples_truncates_after_sorting_and_preserves_constraints( @@ -185,6 +275,7 @@ def test_non_positive_data_samples_do_not_truncate( ) assert kwargs["relative_timestamps"] == pytest.approx([0.0, 1.0], abs=1e-9) + assert kwargs["constraints"]["max_requests"] == 2 @pytest.mark.smoke def test_resolve_profile_passes_replay_specific_kwargs(self, tmp_path: Path): diff --git a/tests/unit/data/deserializers/test_trace_synthetic.py b/tests/unit/data/deserializers/test_trace_synthetic.py index 9a98097a4..9fbc6eefb 100644 --- a/tests/unit/data/deserializers/test_trace_synthetic.py +++ b/tests/unit/data/deserializers/test_trace_synthetic.py @@ -83,6 +83,127 @@ def test_honors_custom_column_names(self, tmp_path: Path, deserializer): assert ds["prompt_tokens_count"] == [2, 4] assert ds["output_tokens_count"] == [20, 40] + @pytest.mark.smoke + def test_generates_large_trace_prompts_from_reusable_base( + self, tmp_path: Path, deserializer + ): + prompt_lengths = [ + 6755, + 7319, + 7234, + 2287, + 9013, + 6506, + 4824, + 3119, + 23090, + 3135, + 26874, + 10487, + 17448, + 6253, + 6725, + 13538, + 87162, + 6166, + 6320, + 2007, + 3174, + 3131, + 
3159, + 6820, + 3154, + 9416, + 7460, + ] + output_lengths = [ + 500, + 490, + 794, + 316, + 3, + 3, + 173, + 20, + 453, + 19, + 458, + 402, + 610, + 3, + 32, + 71, + 402, + 24, + 548, + 354, + 19, + 23, + 20, + 26, + 21, + 145, + 3, + ] + timestamps = [ + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.5, + 0.5, + 0.5, + 0.5, + 0.5, + 0.5, + 0.5, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 2.0, + 2.0, + 2.0, + 2.0, + ] + trace = _write_trace( + tmp_path, + "\n".join( + ( + f'{{"timestamp": {timestamp}, ' + f'"input_length": {prompt_length}, ' + f'"output_length": {output_length}}}' + ) + for timestamp, prompt_length, output_length in zip( + timestamps, prompt_lengths, output_lengths, strict=True + ) + ), + ) + processor = _mock_processor() + + ds = deserializer( + data=trace, + processor_factory=lambda: processor, + random_seed=42, + type_="trace_synthetic", + ) + + assert ds["prompt_tokens_count"] == prompt_lengths + assert ds["output_tokens_count"] == output_lengths + assert processor.encode.call_count <= len(prompt_lengths) + 4 + for prompt, token_count in zip( + ds["prompt"], ds["prompt_tokens_count"], strict=True + ): + assert len(_mock_processor().encode(prompt)) == token_count + @pytest.mark.smoke def test_rejects_invalid_data(self, deserializer): with pytest.raises(DataNotSupportedError, match="path to a trace file"): diff --git a/tests/unit/scheduler/test_trace_replay.py b/tests/unit/scheduler/test_trace_replay.py index d35c4170e..3e22bbe14 100644 --- a/tests/unit/scheduler/test_trace_replay.py +++ b/tests/unit/scheduler/test_trace_replay.py @@ -1,7 +1,6 @@ from __future__ import annotations import asyncio -import math from multiprocessing import get_context from pathlib import Path @@ -103,16 +102,15 @@ def test_initialization_and_serialization(self): assert strategy.type_ == "trace" assert str(strategy) == "trace@2.00" - assert strategy.processes_limit is None - assert strategy.requests_limit == 3 - + assert strategy.processes_limit == 1 + assert strategy.requests_limit is None restored = SchedulingStrategy.model_validate(strategy.model_dump()) assert isinstance(restored, TraceReplayStrategy) assert restored.relative_timestamps == [0.0, 0.5, 1.0] assert restored.time_scale == 2.0 @pytest.mark.smoke - def test_next_request_time_scales_timestamps_and_exhausts_trace(self): + def test_next_request_time_scales_timestamps(self): strategy = TraceReplayStrategy( relative_timestamps=[0.0, 0.5, 1.0], time_scale=2.0, @@ -125,11 +123,38 @@ def test_next_request_time_scales_timestamps_and_exhausts_trace(self): strategy.init_processes_start(1000.0) async def run(): - return [await strategy.next_request_time(0) for _ in range(4)] + return [await strategy.next_request_time(0) for _ in range(3)] assert asyncio.run(run()) == pytest.approx( - [1000.0, 1001.0, 1002.0, math.inf], abs=1e-6 + [1000.0, 1001.0, 1002.0], abs=1e-6 + ) + + @pytest.mark.smoke + def test_next_request_time_parks_when_trace_exhausted(self): + strategy = TraceReplayStrategy( + relative_timestamps=[0.0, 0.5], + time_scale=1.0, + ) + strategy.init_processes_timings( + worker_count=1, + max_concurrency=10, + mp_context=get_context(), ) + strategy.init_processes_start(1000.0) + + async def run(): + # Consume the 2 valid slots + await strategy.next_request_time(0) + await strategy.next_request_time(0) + # The 3rd call parks; should raise CancelledError when cancelled + task = asyncio.create_task(strategy.next_request_time(0)) + await asyncio.sleep(0.05) + assert not task.done(), "expected to be parked, not 
resolved" + task.cancel() + with pytest.raises(asyncio.CancelledError): + await task + + asyncio.run(run()) @pytest.mark.smoke def test_empty_trace_has_no_request_limit_and_uses_start_time(self): From b981ce9fbff7be034cb1ee26ca2b8535ce371f5d Mon Sep 17 00:00:00 2001 From: Vincent Gimenes Date: Sat, 9 May 2026 10:24:38 +0200 Subject: [PATCH 26/27] fix ci with precommit Signed-off-by: Vincent Gimenes --- src/guidellm/data/deserializers/trace_synthetic.py | 3 +-- src/guidellm/scheduler/strategies.py | 5 +++-- tests/unit/benchmark/test_replay_profile.py | 8 ++------ tests/unit/scheduler/test_trace_replay.py | 4 +--- 4 files changed, 7 insertions(+), 13 deletions(-) diff --git a/src/guidellm/data/deserializers/trace_synthetic.py b/src/guidellm/data/deserializers/trace_synthetic.py index b732ff5b3..f4cff0bf5 100644 --- a/src/guidellm/data/deserializers/trace_synthetic.py +++ b/src/guidellm/data/deserializers/trace_synthetic.py @@ -94,8 +94,7 @@ def _create_prompt( f"{prompt_tokens_count} tokens from generated base tokens" ) - prompt = _decode_prompt(processor, prompt_token_ids) - return prompt + return _decode_prompt(processor, prompt_token_ids) def _load_trace_rows( diff --git a/src/guidellm/scheduler/strategies.py b/src/guidellm/scheduler/strategies.py index 33058000f..e47daf1ea 100644 --- a/src/guidellm/scheduler/strategies.py +++ b/src/guidellm/scheduler/strategies.py @@ -698,8 +698,9 @@ def __str__(self) -> str: @property def processes_limit(self) -> PositiveInt | None: - # TODO: Support multi-process trace replay once each scheduled timestamp - # is bound to its request before workers compete for queue items. + # Trace replay is currently constrained to one process until each + # scheduled timestamp is bound to its request before workers compete + # for queue items. 
return 1 @property diff --git a/tests/unit/benchmark/test_replay_profile.py b/tests/unit/benchmark/test_replay_profile.py index 215fa302e..50aea23e7 100644 --- a/tests/unit/benchmark/test_replay_profile.py +++ b/tests/unit/benchmark/test_replay_profile.py @@ -117,9 +117,7 @@ def test_custom_timestamp_column_via_data_args(self, tmp_path: Path): assert kwargs["constraints"]["max_requests"] == 3 @pytest.mark.smoke - def test_large_bursty_trace_sets_default_request_constraint( - self, tmp_path: Path - ): + def test_large_bursty_trace_sets_default_request_constraint(self, tmp_path: Path): prompt_lengths = [ 6755, 7319, @@ -198,9 +196,7 @@ def test_large_bursty_trace_sets_default_request_constraint( data=[str(trace)], ) - assert kwargs["relative_timestamps"] == pytest.approx( - timestamps, abs=1e-9 - ) + assert kwargs["relative_timestamps"] == pytest.approx(timestamps, abs=1e-9) assert kwargs["constraints"]["max_requests"] == 27 @pytest.mark.smoke diff --git a/tests/unit/scheduler/test_trace_replay.py b/tests/unit/scheduler/test_trace_replay.py index 3e22bbe14..8b6f6f7b0 100644 --- a/tests/unit/scheduler/test_trace_replay.py +++ b/tests/unit/scheduler/test_trace_replay.py @@ -125,9 +125,7 @@ def test_next_request_time_scales_timestamps(self): async def run(): return [await strategy.next_request_time(0) for _ in range(3)] - assert asyncio.run(run()) == pytest.approx( - [1000.0, 1001.0, 1002.0], abs=1e-6 - ) + assert asyncio.run(run()) == pytest.approx([1000.0, 1001.0, 1002.0], abs=1e-6) @pytest.mark.smoke def test_next_request_time_parks_when_trace_exhausted(self): From 4c0b43bf3de607f50f4c77c996315c4d48a5d7ba Mon Sep 17 00:00:00 2001 From: Vincent Gimenes Date: Tue, 12 May 2026 10:40:44 +0200 Subject: [PATCH 27/27] enforce single trace data source in resolve_args Signed-off-by: Vincent Gimenes --- src/guidellm/benchmark/profiles.py | 8 +++++++- tests/unit/benchmark/test_replay_profile.py | 10 ++++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/src/guidellm/benchmark/profiles.py b/src/guidellm/benchmark/profiles.py index c4a76933e..704c24e0e 100644 --- a/src/guidellm/benchmark/profiles.py +++ b/src/guidellm/benchmark/profiles.py @@ -365,7 +365,13 @@ def resolve_args( ) -> dict[str, Any]: _ = (rate_type, random_seed) # unused data = kwargs.get("data") - if not data or not data[0]: + if not data: + raise ValueError("Replay profile requires data (path to trace file)") + if len(data) != 1: + raise ValueError( + f"ReplayProfile requires exactly one data source, received {len(data)}" + ) + if not data[0]: raise ValueError("Replay profile requires data (path to trace file)") path = Path(data[0]) if isinstance(data[0], str) else data[0] if not path.exists(): diff --git a/tests/unit/benchmark/test_replay_profile.py b/tests/unit/benchmark/test_replay_profile.py index 50aea23e7..6b50dbe85 100644 --- a/tests/unit/benchmark/test_replay_profile.py +++ b/tests/unit/benchmark/test_replay_profile.py @@ -27,6 +27,16 @@ def test_resolve_args_requires_data(self): random_seed=42, ) + @pytest.mark.smoke + def test_resolve_args_rejects_multiple_data_sources(self): + with pytest.raises(ValueError, match="exactly one data source"): + ReplayProfile.resolve_args( + rate_type="replay", + rate=[1.0], + random_seed=42, + data=["trace-a.jsonl", "trace-b.jsonl"], + ) + @pytest.mark.smoke def test_resolve_args_rejects_missing_or_empty_trace(self, tmp_path: Path): missing = tmp_path / "missing.jsonl"
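The stricter `resolve_args` validation introduced in this patch is easy to exercise in isolation. A quick sketch mirroring the calls used in the tests above (the trace paths are hypothetical and intentionally invalid):

```python
from guidellm.benchmark.profiles import ReplayProfile

# Missing data, an empty list, two sources, and an empty path should all
# be rejected with a ValueError.
for data in (None, [], ["trace-a.jsonl", "trace-b.jsonl"], [""]):
    try:
        ReplayProfile.resolve_args(
            rate_type="replay", rate=[1.0], random_seed=42, data=data
        )
    except ValueError as err:
        print(f"{data!r} -> {err}")
```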