NVIDIA-NeMo · cmunley1 · Jan 29, 2026 · Jan 29, 2026 · coderabbitai · Jan 29, 2026
@@ -39,6 +39,10 @@ class VllmSpecificArgs(TypedDict):
     # Miscellaneous top level vLLM HTTP server arguments.
     # A filepath that can be imported to register a vLLM tool parser
     tool_parser_plugin: NotRequired[str]
+    # Ensures the message history stays unchanged across turns. Useful for on-policy training, but it
+    # sometimes needs to be turned off, e.g. to drop previous reasoning (like Qwen3), for agent context
+    # management or a sliding window, or to drop past environment state observations.
+    enforce_monotonicity: NotRequired[bool]
 
 
 class VllmConfig(GenerationConfig):

@@ -330,6 +330,8 @@ def _setup_vllm_openai_api_server(self, app: FastAPI) -> FastAPI:
             openai_serving_models_kwargs["model_config"] = model_config
         openai_serving_models = OpenAIServingModels(**openai_serving_models_kwargs)
 
+        enforce_monotonicity = self.cfg["vllm_cfg"].get("enforce_monotonicity", True)
-        enforce_monotonicity = self.cfg["vllm_cfg"].get("enforce_monotonicity", True)
+        enforce_monotonicity = self.cfg["vllm_cfg"]["enforce_monotonicity"]
-        enforce_monotonicity = self.cfg["vllm_cfg"].get("enforce_monotonicity", True)
+        enforce_monotonicity = self.cfg["vllm_cfg"]["enforce_monotonicity"]
+
         class NeMoRLOpenAIChatRequestMixin:
             def model_post_init(self, context):
                 # NeMo-Gym specific processing. This is just how NeMo-Gym returns the extra token information.
@@ -384,6 +386,9 @@ async def _preprocess_chat(
                     add_special_tokens,
                 )
 
+                if not enforce_monotonicity:
+                    return res
+
                 if request.required_prefix_token_ids is None:
                     return res