diff --git a/nemo_rl/models/generation/vllm/config.py b/nemo_rl/models/generation/vllm/config.py
index 857ed177c2..7e8d76567f 100644
--- a/nemo_rl/models/generation/vllm/config.py
+++ b/nemo_rl/models/generation/vllm/config.py
@@ -39,6 +39,10 @@ class VllmSpecificArgs(TypedDict):
     # Miscellaneous top level vLLM HTTP server arguments.
     # A filepath that can be imported to register a vLLM tool parser
     tool_parser_plugin: NotRequired[str]
+    # Ensures the message history over multiple turns remains unchanged. Useful for on-policy training,
+    # but sometimes we may want to drop previous reasoning (like Qwen3), manage agent context,
+    # use a sliding window, or drop past environment state observations.
+    enforce_monotonicity: NotRequired[bool]
 
 
 class VllmConfig(GenerationConfig):
diff --git a/nemo_rl/models/generation/vllm/vllm_worker_async.py b/nemo_rl/models/generation/vllm/vllm_worker_async.py
index 0e4ea5cdeb..5a17a3f68c 100644
--- a/nemo_rl/models/generation/vllm/vllm_worker_async.py
+++ b/nemo_rl/models/generation/vllm/vllm_worker_async.py
@@ -330,6 +330,8 @@ def _setup_vllm_openai_api_server(self, app: FastAPI) -> FastAPI:
             openai_serving_models_kwargs["model_config"] = model_config
         openai_serving_models = OpenAIServingModels(**openai_serving_models_kwargs)
 
+        enforce_monotonicity = self.cfg["vllm_cfg"].get("enforce_monotonicity", True)
+
         class NeMoRLOpenAIChatRequestMixin:
             def model_post_init(self, context):
                 # NeMo-Gym specific processing. This is just how NeMo-Gym returns the extra token information.
@@ -384,6 +386,9 @@ async def _preprocess_chat(
                     add_special_tokens,
                 )
 
+                if not enforce_monotonicity:
+                    return res
+
                 if request.required_prefix_token_ids is None:
                     return res
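
Note for reviewers: below is a minimal sketch, not part of the diff, of how the new flag is expected to gate the prefix check. Only the config key "enforce_monotonicity" and its default of True come from the change above; the helper should_check_prefix and its inputs are hypothetical and stand in for the logic inside the overridden _preprocess_chat. Defaulting to True keeps the existing strict on-policy behavior unless a config explicitly opts out.

# Hypothetical illustration of the control flow added in _preprocess_chat.
from typing import Any, Optional


def should_check_prefix(
    cfg: dict[str, Any], required_prefix_token_ids: Optional[list[int]]
) -> bool:
    # Default to True so existing configs keep the strict on-policy behavior.
    enforce_monotonicity = cfg.get("vllm_cfg", {}).get("enforce_monotonicity", True)
    if not enforce_monotonicity:
        # Mirrors the early `return res` added before the prefix validation.
        return False
    return required_prefix_token_ids is not None


if __name__ == "__main__":
    cfg = {"vllm_cfg": {"enforce_monotonicity": False}}
    # Prints False: with monotonicity disabled, the prefix check is skipped even
    # when required_prefix_token_ids is provided.
    print(should_check_prefix(cfg, [1, 2, 3]))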